[llvm-branch-commits] [llvm] [AMDGPU] Codegen support for constrained multi-dword sloads (PR #96163)
Christudasan Devadasan via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jun 20 03:27:24 PDT 2024
https://github.com/cdevadas created https://github.com/llvm/llvm-project/pull/96163
For targets that support the xnack replay feature (gfx8+), the
multi-dword scalar loads shouldn't clobber any register that
holds the src address. The constrained versions of the scalar
loads have the early-clobber flag attached to the dst operand
to prevent RA from re-allocating any of the src regs for the
dst operand.
>From 0dde26d55af778d7b43a3daac1bd36029db0a918 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan <Christudasan.Devadasan at amd.com>
Date: Thu, 20 Jun 2024 10:21:32 +0000
Subject: [PATCH] [AMDGPU] Codegen support for constrained multi-dword sloads
For targets that support the xnack replay feature (gfx8+), the
multi-dword scalar loads shouldn't clobber any register that
holds the src address. The constrained versions of the scalar
loads have the early-clobber flag attached to the dst operand
to prevent RA from re-allocating any of the src regs for the
dst operand.
---
llvm/lib/Target/AMDGPU/SMInstructions.td | 116 +-
.../CodeGen/AMDGPU/GlobalISel/addsubu64.ll | 20 +-
.../AMDGPU/GlobalISel/bool-legalization.ll | 8 +-
.../AMDGPU/GlobalISel/cvt_f32_ubyte.ll | 10 +-
.../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 8 +-
.../AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll | 244 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 234 +-
.../GlobalISel/inst-select-fract.f64.mir | 12 +-
.../GlobalISel/inst-select-load-constant.mir | 72 +-
.../AMDGPU/GlobalISel/lds-zero-initializer.ll | 2 +-
.../GlobalISel/llvm.amdgcn.div.scale.ll | 405 +-
.../GlobalISel/llvm.amdgcn.intersect_ray.ll | 203 +-
.../GlobalISel/llvm.amdgcn.mfma.gfx90a.ll | 4 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll | 42 +-
.../GlobalISel/llvm.amdgcn.set.inactive.ll | 180 +-
.../GlobalISel/llvm.amdgcn.update.dpp.ll | 128 +-
.../AMDGPU/GlobalISel/load-constant.96.ll | 20 +-
.../AMDGPU/GlobalISel/mul-known-bits.i64.ll | 66 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll | 164 +-
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 278 +-
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 527 +-
.../GlobalISel/widen-i8-i16-scalar-loads.ll | 292 +-
llvm/test/CodeGen/AMDGPU/add.ll | 545 +-
llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 268 +-
llvm/test/CodeGen/AMDGPU/amd.endpgm.ll | 16 +-
.../AMDGPU/amdgcn-load-offset-from-reg.ll | 4 +-
.../AMDGPU/amdgpu-codegenprepare-idiv.ll | 1225 ++--
llvm/test/CodeGen/AMDGPU/and.ll | 12 +-
llvm/test/CodeGen/AMDGPU/anyext.ll | 22 +-
.../AMDGPU/atomic_optimizations_buffer.ll | 524 +-
.../atomic_optimizations_global_pointer.ll | 2005 ++++---
.../atomic_optimizations_local_pointer.ll | 2155 +++----
.../AMDGPU/atomic_optimizations_raw_buffer.ll | 456 +-
.../atomic_optimizations_struct_buffer.ll | 526 +-
llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 56 +-
llvm/test/CodeGen/AMDGPU/bfe-combine.ll | 56 +-
llvm/test/CodeGen/AMDGPU/bfe-patterns.ll | 72 +-
llvm/test/CodeGen/AMDGPU/bfi_int.ll | 136 +-
llvm/test/CodeGen/AMDGPU/bfm.ll | 12 +-
llvm/test/CodeGen/AMDGPU/bitreverse.ll | 140 +-
llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 74 +-
.../branch-folding-implicit-def-subreg.ll | 2 +-
llvm/test/CodeGen/AMDGPU/bswap.ll | 156 +-
llvm/test/CodeGen/AMDGPU/build_vector.ll | 44 +-
.../CodeGen/AMDGPU/calling-conventions.ll | 324 +-
.../test/CodeGen/AMDGPU/carryout-selection.ll | 850 +--
llvm/test/CodeGen/AMDGPU/clamp-modifier.ll | 418 +-
llvm/test/CodeGen/AMDGPU/clamp.ll | 1334 ++---
.../CodeGen/AMDGPU/combine-cond-add-sub.ll | 30 +-
.../CodeGen/AMDGPU/combine-vload-extract.ll | 22 +-
llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll | 257 +-
llvm/test/CodeGen/AMDGPU/copy_to_scc.ll | 22 +-
llvm/test/CodeGen/AMDGPU/ctlz.ll | 54 +-
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 86 +-
llvm/test/CodeGen/AMDGPU/ctpop16.ll | 52 +-
llvm/test/CodeGen/AMDGPU/ctpop64.ll | 76 +-
llvm/test/CodeGen/AMDGPU/cttz.ll | 32 +-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 48 +-
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 28 +-
.../CodeGen/AMDGPU/dag-divergence-atomic.ll | 305 +-
.../AMDGPU/divergence-driven-buildvector.ll | 82 +-
llvm/test/CodeGen/AMDGPU/ds-alignment.ll | 270 +-
.../AMDGPU/ds-combine-with-dependence.ll | 2 +-
llvm/test/CodeGen/AMDGPU/ds_write2.ll | 46 +-
.../CodeGen/AMDGPU/extract_vector_dynelt.ll | 16 +-
.../CodeGen/AMDGPU/extract_vector_elt-f16.ll | 122 +-
llvm/test/CodeGen/AMDGPU/fabs.ll | 24 +-
llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 422 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 722 +--
llvm/test/CodeGen/AMDGPU/fcmp.f16.ll | 638 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 415 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 356 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 310 +-
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 398 +-
llvm/test/CodeGen/AMDGPU/fdiv.ll | 107 +-
.../CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll | 184 +-
llvm/test/CodeGen/AMDGPU/flat_atomics.ll | 336 +-
.../CodeGen/AMDGPU/flat_atomics_i32_system.ll | 120 +-
llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll | 2234 +++----
.../CodeGen/AMDGPU/flat_atomics_i64_system.ll | 324 +-
llvm/test/CodeGen/AMDGPU/fma-combine.ll | 162 +-
llvm/test/CodeGen/AMDGPU/fmax3.ll | 448 +-
llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll | 40 +-
llvm/test/CodeGen/AMDGPU/fmaximum.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 2790 ++++-----
llvm/test/CodeGen/AMDGPU/fmin3.ll | 664 +--
llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll | 80 +-
llvm/test/CodeGen/AMDGPU/fminimum.ll | 8 +-
llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 342 +-
llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll | 752 +--
llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 90 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 52 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll | 84 +-
llvm/test/CodeGen/AMDGPU/fneg-fabs.ll | 48 +-
llvm/test/CodeGen/AMDGPU/fneg.ll | 274 +-
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll | 20 +-
llvm/test/CodeGen/AMDGPU/fp-classify.ll | 282 +-
.../AMDGPU/fp-min-max-buffer-atomics.ll | 100 +-
.../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 36 +-
.../CodeGen/AMDGPU/fp64-atomics-gfx90a.ll | 262 +-
.../AMDGPU/fp64-min-max-buffer-atomics.ll | 56 +-
.../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 28 +-
llvm/test/CodeGen/AMDGPU/fp_to_sint.ll | 126 +-
llvm/test/CodeGen/AMDGPU/fp_to_uint.ll | 104 +-
llvm/test/CodeGen/AMDGPU/fpext.f16.ll | 586 +-
llvm/test/CodeGen/AMDGPU/fptosi.f16.ll | 266 +-
llvm/test/CodeGen/AMDGPU/fptoui.f16.ll | 266 +-
llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll | 972 ++--
llvm/test/CodeGen/AMDGPU/fptrunc.ll | 812 +--
llvm/test/CodeGen/AMDGPU/frem.ll | 196 +-
llvm/test/CodeGen/AMDGPU/fshl.ll | 120 +-
llvm/test/CodeGen/AMDGPU/fshr.ll | 48 +-
llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 252 +-
llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll | 42 +-
.../global-atomics-fp-wrong-subtarget.ll | 11 +-
.../CodeGen/AMDGPU/global-i16-load-store.ll | 16 +-
.../AMDGPU/global-load-saddr-to-vaddr.ll | 20 +-
llvm/test/CodeGen/AMDGPU/global_atomics.ll | 650 +--
.../AMDGPU/global_atomics_i32_system.ll | 180 +-
.../test/CodeGen/AMDGPU/global_atomics_i64.ll | 2646 ++++-----
.../AMDGPU/global_atomics_i64_system.ll | 564 +-
.../AMDGPU/global_atomics_scan_fadd.ll | 2044 +++----
.../AMDGPU/global_atomics_scan_fmax.ll | 1188 ++--
.../AMDGPU/global_atomics_scan_fmin.ll | 1188 ++--
.../AMDGPU/global_atomics_scan_fsub.ll | 1612 ++---
.../identical-subrange-spill-infloop.ll | 30 +-
llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 507 +-
llvm/test/CodeGen/AMDGPU/idot2.ll | 367 +-
llvm/test/CodeGen/AMDGPU/idot4s.ll | 607 +-
llvm/test/CodeGen/AMDGPU/idot4u.ll | 865 +--
llvm/test/CodeGen/AMDGPU/idot8s.ll | 258 +-
llvm/test/CodeGen/AMDGPU/idot8u.ll | 259 +-
llvm/test/CodeGen/AMDGPU/imm.ll | 642 +-
.../AMDGPU/indirect-addressing-term.ll | 2 +-
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 442 +-
llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 14 +-
.../insert_waitcnt_for_precise_memory.ll | 96 +-
llvm/test/CodeGen/AMDGPU/kernel-args.ll | 564 +-
.../CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll | 380 +-
.../CodeGen/AMDGPU/lds-zero-initializer.ll | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 124 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll | 4 +-
.../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll | 1072 ++--
.../CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll | 1860 +++---
.../AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll | 37 +-
.../AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll | 32 +-
.../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 28 +-
...vm.amdgcn.global.atomic.ordered.add.b64.ll | 27 +-
.../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 12 +-
.../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 12 +-
.../CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll | 736 +--
.../CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll | 1268 ++--
.../AMDGPU/llvm.amdgcn.intersect_ray.ll | 66 +-
.../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 224 +-
.../AMDGPU/llvm.amdgcn.permlane16.var.ll | 192 +-
.../CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll | 20 +-
.../llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll | 100 +-
.../llvm.amdgcn.raw.tbuffer.store.d16.ll | 150 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 310 +-
.../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 310 +-
.../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 334 +-
.../CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll | 96 +-
.../AMDGPU/llvm.amdgcn.set.inactive.ll | 214 +-
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll | 596 +-
llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll | 120 +-
llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll | 56 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 209 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 209 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 41 +-
llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll | 120 +-
llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll | 536 +-
llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll | 42 +-
.../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 30 +-
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 32 +-
llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll | 90 +-
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 124 +-
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 124 +-
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 156 +-
llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 524 +-
llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 520 +-
llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 424 +-
.../AMDGPU/llvm.r600.read.local.size.ll | 38 +-
llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll | 100 +-
llvm/test/CodeGen/AMDGPU/llvm.round.ll | 426 +-
llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll | 56 +-
llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll | 80 +-
llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll | 80 +-
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 20 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 3494 +++++------
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 1585 ++---
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 413 +-
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 88 +-
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 2533 ++++----
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 582 +-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 448 +-
.../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 78 +-
.../AMDGPU/lower-lds-struct-aa-memcpy.ll | 8 +-
.../CodeGen/AMDGPU/lower-lds-struct-aa.ll | 9 +-
llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll | 6 +-
llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll | 140 +-
llvm/test/CodeGen/AMDGPU/mad.u16.ll | 48 +-
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 22 +-
llvm/test/CodeGen/AMDGPU/madak.ll | 76 +-
.../CodeGen/AMDGPU/max-hard-clause-length.ll | 420 +-
llvm/test/CodeGen/AMDGPU/max.i16.ll | 48 +-
llvm/test/CodeGen/AMDGPU/memory_clause.ll | 25 +-
llvm/test/CodeGen/AMDGPU/min.ll | 132 +-
.../CodeGen/AMDGPU/move-to-valu-addsubu64.ll | 34 +-
.../move-to-valu-pseudo-scalar-trans.ll | 60 +-
llvm/test/CodeGen/AMDGPU/mul.ll | 1510 ++---
llvm/test/CodeGen/AMDGPU/mul_int24.ll | 94 +-
llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll | 146 +-
llvm/test/CodeGen/AMDGPU/offset-split-flat.ll | 890 +--
.../CodeGen/AMDGPU/offset-split-global.ll | 722 +--
llvm/test/CodeGen/AMDGPU/omod.ll | 88 +-
llvm/test/CodeGen/AMDGPU/optimize-compare.ll | 8 +-
llvm/test/CodeGen/AMDGPU/or.ll | 270 +-
llvm/test/CodeGen/AMDGPU/packed-op-sel.ll | 64 +-
.../AMDGPU/post-ra-soft-clause-dbg-info.ll | 8 +-
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 12 +-
.../test/CodeGen/AMDGPU/promote-vect3-load.ll | 4 +-
.../AMDGPU/ptr-buffer-alias-scheduling.ll | 52 +-
llvm/test/CodeGen/AMDGPU/rcp-pattern.ll | 122 +-
llvm/test/CodeGen/AMDGPU/rotl.ll | 52 +-
llvm/test/CodeGen/AMDGPU/rotr.ll | 20 +-
llvm/test/CodeGen/AMDGPU/saddo.ll | 258 +-
llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll | 52 +-
llvm/test/CodeGen/AMDGPU/sdiv.ll | 278 +-
llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll | 370 +-
llvm/test/CodeGen/AMDGPU/select.f16.ll | 630 +-
llvm/test/CodeGen/AMDGPU/shl.ll | 386 +-
llvm/test/CodeGen/AMDGPU/shl.v2i16.ll | 140 +-
.../CodeGen/AMDGPU/shrink-add-sub-constant.ll | 1364 ++---
llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll | 8 +-
llvm/test/CodeGen/AMDGPU/sign_extend.ll | 184 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 20 +-
llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll | 354 +-
llvm/test/CodeGen/AMDGPU/sitofp.f16.ll | 182 +-
.../CodeGen/AMDGPU/spill-scavenge-offset.ll | 5170 ++++++++---------
llvm/test/CodeGen/AMDGPU/sra.ll | 274 +-
llvm/test/CodeGen/AMDGPU/srl.ll | 70 +-
llvm/test/CodeGen/AMDGPU/sub.ll | 236 +-
llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 304 +-
llvm/test/CodeGen/AMDGPU/trap-abis.ll | 22 +-
llvm/test/CodeGen/AMDGPU/trunc-combine.ll | 14 +-
llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll | 6 +-
llvm/test/CodeGen/AMDGPU/uaddo.ll | 178 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 230 +-
llvm/test/CodeGen/AMDGPU/udivrem.ll | 146 +-
llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll | 284 +-
llvm/test/CodeGen/AMDGPU/uitofp.f16.ll | 182 +-
llvm/test/CodeGen/AMDGPU/uniform-cfg.ll | 150 +-
llvm/test/CodeGen/AMDGPU/usubo.ll | 178 +-
.../CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll | 8 +-
llvm/test/CodeGen/AMDGPU/v_cndmask.ll | 250 +-
llvm/test/CodeGen/AMDGPU/v_madak_f16.ll | 56 +-
llvm/test/CodeGen/AMDGPU/v_pack.ll | 82 +-
llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll | 90 +-
.../CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll | 8 +-
llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll | 10 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 10 +-
llvm/test/CodeGen/AMDGPU/wave32.ll | 282 +-
llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll | 90 +-
llvm/test/CodeGen/AMDGPU/xor.ll | 174 +-
llvm/test/CodeGen/AMDGPU/zero_extend.ll | 2 +-
265 files changed, 43687 insertions(+), 43514 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 4551a3a615b15..9fbedce554a53 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -867,13 +867,104 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
+class SMRDAlignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // Returns true for single-dword-or-smaller loads (Size <= 4) and for multi-dword loads whose alignment is at least PowerOf2Ceil(Size).
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size <= 4) || (Ld->getAlign().value() >= PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size <= 4) || (Ld.getMMO().getAlign().value() >= PowerOf2Ceil(Size));
+ }];
+}
+
+class SMRDUnalignedLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{
+ // Returns true only for multi-dword loads (Size > 4) whose alignment is below PowerOf2Ceil(Size), i.e. under-aligned ones.
+ LoadSDNode *Ld = cast<LoadSDNode>(N);
+ unsigned Size = Ld->getMemoryVT().getStoreSize();
+ return (Size > 4) && (Ld->getAlign().value() < PowerOf2Ceil(Size));
+}]> {
+ let GISelPredicateCode = [{
+ auto &Ld = cast<GLoad>(MI);
+ TypeSize Size = Ld.getMMO().getSize().getValue();
+ return (Size > 4) && (Ld.getMMO().getAlign().value() < PowerOf2Ceil(Size));
+ }];
+}
+// smrd_load fragments filtered by the SMRDAlignedLoadPat / SMRDUnalignedLoadPat predicates.
+def alignedmultidwordload : SMRDAlignedLoadPat<smrd_load>;
+def unalignedmultidwordload : SMRDUnalignedLoadPat<smrd_load>;
+
+multiclass SMRD_Align_Pattern <string Instr, ValueType vt> {
+ // Each addressing form gets a pattern pair: alignedmultidwordload selects the plain pseudo, unalignedmultidwordload its "_ec" variant (presumably the early-clobber form from the commit message).
+ // 1. IMM offset (isGFX8Plus)
+ def : GCNPat <
+ (alignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+
+ // 2. SGPR offset: "_SGPR" under isGFX8Only; "_SGPR_IMM" with a zero immediate offset under isGFX9Plus
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_ec") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isGFX8Only];
+ }
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 3. SGPR+IMM offset (isGFX9Plus)
+ def : GCNPat <
+ (alignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+ def : GCNPat <
+ (unalignedmultidwordload (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM_ec") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 4. No offset: "_IMM" form with a zero immediate offset (isGFX8Plus)
+ def : GCNPat <
+ (vt (alignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+ def : GCNPat <
+ (vt (unalignedmultidwordload (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM_ec") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX8Plus];
+ }
+}
+
multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. IMM offset
def : GCNPat <
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))
- >;
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
+ }
// 2. 32-bit IMM offset on CI
if immci then def : GCNPat <
@@ -886,26 +977,17 @@ multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
(vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
- let OtherPredicates = [isNotGFX9Plus];
- }
- def : GCNPat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ let OtherPredicates = [isGFX6GFX7];
}
- // 4. SGPR+IMM offset
+ // 4. No offset
def : GCNPat <
- (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
- let OtherPredicates = [isGFX9Plus];
+ (vt (smrd_load (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))> {
+ let OtherPredicates = [isGFX6GFX7];
}
- // 5. No offset
- def : GCNPat <
- (vt (smrd_load (i64 SReg_64:$sbase))),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
- >;
+ defm : SMRD_Align_Pattern<Instr, vt>;
}
multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
index a38b6e3263882..9a8672dba5357 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@@ -7,11 +7,11 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -23,10 +23,10 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -59,11 +59,11 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s0, s6, s2
+; GFX11-NEXT: s_subb_u32 s1, s7, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -75,10 +75,10 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
index bb5ccc3657dc4..57a8bbbb7d185 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll
@@ -113,9 +113,9 @@ bb1:
define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
; WAVE64-LABEL: brcond_sgpr_trunc_and:
; WAVE64: ; %bb.0: ; %entry
-; WAVE64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE64-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE64-NEXT: s_and_b32 s0, s0, s1
+; WAVE64-NEXT: s_and_b32 s0, s2, s3
; WAVE64-NEXT: s_xor_b32 s0, s0, 1
; WAVE64-NEXT: s_and_b32 s0, s0, 1
; WAVE64-NEXT: s_cmp_lg_u32 s0, 0
@@ -131,9 +131,9 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
;
; WAVE32-LABEL: brcond_sgpr_trunc_and:
; WAVE32: ; %bb.0: ; %entry
-; WAVE32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; WAVE32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; WAVE32-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE32-NEXT: s_and_b32 s0, s0, s1
+; WAVE32-NEXT: s_and_b32 s0, s2, s3
; WAVE32-NEXT: s_xor_b32 s0, s0, 1
; WAVE32-NEXT: s_and_b32 s0, s0, 1
; WAVE32-NEXT: s_cmp_lg_u32 s0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 3f034eaca4997..9cabe0c0ae9de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1400,11 +1400,11 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
@@ -1412,8 +1412,8 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
; VI-NEXT: v_add_f32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index a018ea5bf18f1..ce0d9c3c5365e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -27,10 +27,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -43,10 +43,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 4e94a646f6da5..081e25708c067 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1021,20 +1021,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1044,20 +1044,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1067,20 +1067,20 @@ main_body:
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
-; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1090,21 +1090,21 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
@@ -1112,14 +1112,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
@@ -1134,14 +1134,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB39_2:
@@ -1162,13 +1162,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB40_2:
@@ -1184,14 +1184,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB40_2:
@@ -1204,21 +1204,21 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
@@ -1226,14 +1226,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB42_2:
@@ -1298,14 +1298,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB42_2:
@@ -1480,34 +1480,34 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: s_mov_b32 s4, s3
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: s_mov_b32 s2, s5
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s2, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
@@ -1522,14 +1522,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB49_2:
@@ -1542,11 +1542,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1566,10 +1566,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1583,11 +1583,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1595,10 +1595,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1612,11 +1612,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1637,10 +1637,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1761,19 +1761,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1803,11 +1803,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1825,10 +1825,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1842,19 +1842,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1884,19 +1884,19 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 05cdb54f5dd74..4635db9d78c3d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -38,20 +38,20 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e32 v2, v2
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, v1, s2
+; VI-NEXT: v_fma_f16 v2, -v0, v1, s0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -88,16 +88,16 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s0
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -134,16 +134,16 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x8
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f16_e32 v0, s0
-; VI-NEXT: v_mul_f16_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f16_e32 v0, s1
+; VI-NEXT: v_mul_f16_e32 v0, s0, v0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f16 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_short v[0:1], v2
@@ -189,14 +189,14 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, s6
+; VI-NEXT: v_div_scale_f32 v2, vcc, s6, v0, s6
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
@@ -207,9 +207,9 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
+; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s6
; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, -v1, v0, s2
+; VI-NEXT: v_fma_f32 v2, -v1, v0, s6
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -244,16 +244,16 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s0
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f32_e32 v0, s1
+; VI-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f32 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -288,16 +288,16 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: v_rcp_f32_e32 v0, s0
-; VI-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_rcp_f32_e32 v0, s1
+; VI-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-NEXT: v_trunc_f32_e32 v0, v0
-; VI-NEXT: v_fma_f32 v2, -v0, s0, v1
+; VI-NEXT: v_fma_f32 v2, -v0, s1, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -342,15 +342,15 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], s[2:3]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
+; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
@@ -359,9 +359,9 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -402,23 +402,23 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -459,23 +459,23 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f64_e32 v[0:1], s[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[0:1], 1.0
+; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0
; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mul_f64 v[4:5], s[2:3], v[0:1]
-; VI-NEXT: v_fma_f64 v[6:7], -s[0:1], v[4:5], v[2:3]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[4:5], s[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[6:7], -s[2:3], v[4:5], v[2:3]
; VI-NEXT: v_fma_f64 v[0:1], v[6:7], v[0:1], v[4:5]
; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
-; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[0:1], v[2:3]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -546,31 +546,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x10
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: s_lshr_b32 s3, s0, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; VI-NEXT: s_lshr_b32 s3, s1, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: s_lshr_b32 s1, s2, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 16
; VI-NEXT: v_rcp_f32_e32 v3, v3
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
+; VI-NEXT: v_fma_f16 v0, -v0, v1, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v1, s2
; VI-NEXT: v_mul_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
+; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s2
; VI-NEXT: v_trunc_f16_e32 v1, v1
-; VI-NEXT: v_fma_f16 v1, -v1, v2, s1
+; VI-NEXT: v_fma_f16 v1, -v1, v2, s2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v2, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -683,47 +683,47 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; VI-NEXT: s_lshr_b32 s8, s0, 16
+; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; VI-NEXT: s_lshr_b32 s8, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: s_lshr_b32 s6, s2, 16
+; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
+; VI-NEXT: v_cvt_f32_f16_e32 v4, s3
; VI-NEXT: v_mul_f32_e32 v0, v0, v2
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_rcp_f32_e32 v4, v4
-; VI-NEXT: s_lshr_b32 s9, s1, 16
-; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
+; VI-NEXT: s_lshr_b32 s9, s3, 16
+; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s0
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
+; VI-NEXT: v_fma_f16 v0, -v0, v1, s0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s7, s3, 16
+; VI-NEXT: s_lshr_b32 s7, s1, 16
; VI-NEXT: v_mul_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_rcp_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
; VI-NEXT: v_trunc_f16_e32 v1, v1
; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_cvt_f32_f16_e32 v2, s1
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_mul_f32_e32 v2, v2, v4
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v4, s9
-; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
+; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s1
; VI-NEXT: v_trunc_f16_e32 v2, v2
-; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
+; VI-NEXT: v_fma_f16 v2, -v2, v3, s1
; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
; VI-NEXT: v_mul_f32_e32 v3, v3, v5
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
@@ -793,14 +793,14 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x20
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s2
-; VI-NEXT: v_div_scale_f32 v2, vcc, s2, v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_div_scale_f32 v1, s[6:7], v0, v0, s0
+; VI-NEXT: v_div_scale_f32 v2, vcc, s0, v0, s0
; VI-NEXT: v_rcp_f32_e32 v3, v1
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v4, -v1, v3, 1.0
@@ -811,12 +811,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v1, -v1, v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v1, v1, v3, v4
-; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s2
+; VI-NEXT: v_div_fixup_f32 v1, v1, v0, s0
; VI-NEXT: v_trunc_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v0, -v1, v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, s3
-; VI-NEXT: v_div_scale_f32 v3, vcc, s3, v1, s3
+; VI-NEXT: v_fma_f32 v0, -v1, v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, s1
+; VI-NEXT: v_div_scale_f32 v3, vcc, s1, v1, s1
; VI-NEXT: v_rcp_f32_e32 v4, v2
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; VI-NEXT: v_fma_f32 v5, -v2, v4, 1.0
@@ -827,9 +827,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_fma_f32 v2, -v2, v5, v3
; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5
-; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3
+; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s1
; VI-NEXT: v_trunc_f32_e32 v2, v2
-; VI-NEXT: v_fma_f32 v1, -v2, v1, s3
+; VI-NEXT: v_fma_f32 v1, -v2, v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
index 5d4816812e6c0..83a85c7e845dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fract.f64.mir
@@ -16,9 +16,9 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
@@ -64,9 +64,9 @@ body: |
; CHECK: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub0_sub1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX4_IMM]].sub2_sub3
+ ; CHECK-NEXT: early-clobber %3:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 36, 0 :: (dereferenceable invariant load (<2 x s64>), align 4, addrspace 4)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY %3.sub0_sub1
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY %3.sub2_sub3
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (load (s64), addrspace 1)
; CHECK-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO -9223372036854775808
; CHECK-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
index c44477273dad0..7bbce45d78387 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-constant.mir
@@ -167,15 +167,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_v2s32_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -210,15 +210,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_v4s16_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s16>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -254,15 +254,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v4s32_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<4 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -342,15 +342,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1 = COPY %1
;
; GFX10-LABEL: name: load_constant_s64_align4
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1 = COPY [[S_LOAD_DWORDX2_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 0, 0 :: (load (s64), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(s64) = G_LOAD %0 :: (load (s64), align 4, addrspace 4)
$sgpr0_sgpr1 = COPY %1
@@ -386,15 +386,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v2s64
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<2 x s64>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -782,15 +782,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s16
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[S_LOAD_DWORDX4_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_128 = S_LOAD_DWORDX4_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s16>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s16>) = G_LOAD %0 :: (load (<8 x s16>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3 = COPY %1
@@ -826,15 +826,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s32
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY [[S_LOAD_DWORDX8_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_256 = S_LOAD_DWORDX8_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s32>) = G_LOAD %0 :: (load (<8 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY %1
@@ -870,15 +870,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
;
; GFX10-LABEL: name: load_constant_v16s32
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<16 x s32>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<16 x s32>) = G_LOAD %0 :: (load (<16 x s32>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
@@ -914,15 +914,15 @@ body: |
; GFX8: liveins: $sgpr0_sgpr1
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX8-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
- ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX8-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+ ; GFX8-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
;
; GFX10-LABEL: name: load_constant_v8s64
; GFX10: liveins: $sgpr0_sgpr1
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0_sgpr1
- ; GFX10-NEXT: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
- ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[S_LOAD_DWORDX16_IMM]]
+ ; GFX10-NEXT: early-clobber %1:sgpr_512 = S_LOAD_DWORDX16_IMM_ec [[COPY]], 0, 0 :: (load (<8 x s64>), align 4, addrspace 4)
+ ; GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
%0:sgpr(p4) = COPY $sgpr0_sgpr1
%1:sgpr(<8 x s64>) = G_LOAD %0 :: (load (<8 x s64>), align 4, addrspace 4)
$sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
index 7587aa0cad2d4..2a725eccf937c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-zero-initializer.ll
@@ -17,7 +17,7 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p
; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @lds
; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[S_MOV_B32_1]], [[S_MOV_B32_]], implicit-def dead $scc
; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 9, 0
- ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]], 36, 0
+ ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]], 36, 0
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
; GCN: $m0 = S_MOV_B32 -1
; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
index 1a49a38158122..4671f602ff9c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll
@@ -27,11 +27,11 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: test_div_scale_f32_1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -40,38 +40,38 @@ define amdgpu_kernel void @test_div_scale_f32_1(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -110,11 +110,11 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: test_div_scale_f32_2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -123,38 +123,38 @@ define amdgpu_kernel void @test_div_scale_f32_2(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v1, v2, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v1, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v1, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -195,7 +195,6 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -208,8 +207,10 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[2:3], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -217,30 +218,32 @@ define amdgpu_kernel void @test_div_scale_f64_1(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[2:3], v[2:3], v[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[2:3], v[2:3], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -281,7 +284,6 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -294,8 +296,10 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -303,30 +307,32 @@ define amdgpu_kernel void @test_div_scale_f64_2(ptr addrspace(1) %out, ptr addrs
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3] offset:8 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[2:3], v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -661,7 +667,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_num_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -672,33 +678,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], v[0:1], s[2:3]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_num_1:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], v[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -736,7 +744,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_num_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -747,33 +755,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], v[0:1], s[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], v[0:1], s[2:3]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_num_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], v[0:1], s[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], v[0:1], s[2:3]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_num_2:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], v[0:1], s[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], v[0:1], s[2:3]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -811,7 +821,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_den_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -822,33 +832,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[0:1], s[0:1], v[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], s[2:3], s[2:3], v[0:1]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_1:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[0:1], s[0:1], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, s[2:3], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_den_1:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], v[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[2:3], v[0:1]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -886,7 +898,7 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
; GFX8-LABEL: test_div_scale_f64_scalar_den_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -897,33 +909,35 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[0:1], v[0:1]
+; GFX8-NEXT: v_div_scale_f64 v[0:1], s[0:1], v[0:1], s[2:3], v[0:1]
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_scalar_den_2:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x54
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[0:1], v[0:1]
+; GFX10-NEXT: v_div_scale_f64 v[0:1], s0, v[0:1], s[2:3], v[0:1]
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_scalar_den_2:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x54
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[0:1], v[0:1]
+; GFX11-NEXT: v_div_scale_f64 v[0:1], null, v[0:1], s[2:3], v[0:1]
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -957,12 +971,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -981,13 +996,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(ptr addrspace(1) %out
; GFX11-LABEL: test_div_scale_f32_all_scalar_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x4c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x70
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s3, s3, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_scale_f32 v0, null, s5, s5, s4
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1015,12 +1030,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x70
; GFX8-NEXT: s_load_dword s3, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s3, v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1039,13 +1055,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(ptr addrspace(1) %out
; GFX11-LABEL: test_div_scale_f32_all_scalar_2:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x4c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x70
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s3, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s5, s4
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1074,13 +1090,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[0:1], s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1090,22 +1107,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[4:5], s[4:5], s[2:3]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[4:5], s[4:5], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1134,13 +1153,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[4:5], v[0:1], s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1150,22 +1170,24 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(ptr addrspace(1) %out
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[2:3], s[4:5], s[2:3]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_all_scalar_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x2
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x74
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[2:3], s[4:5], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1195,42 +1217,42 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(ptr addrspace(1) %o
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_num:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_num:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, 1.0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, 1.0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_num:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1264,42 +1286,42 @@ define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(ptr addrspace(1) %o
;
; GFX8-LABEL: test_div_scale_f32_inline_imm_den:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], 2.0, 2.0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_inline_imm_den:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, 2.0, 2.0, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, 2.0, 2.0, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_inline_imm_den:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, 2.0, 2.0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1336,11 +1358,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: test_div_scale_f32_fabs_num:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
@@ -1350,41 +1372,41 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_num(ptr addrspace(1) %out, pt
; GFX8-NEXT: flat_load_dword v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_num:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v2, v2, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v2, v2, v0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_num:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1426,11 +1448,11 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: test_div_scale_f32_fabs_den:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
@@ -1440,41 +1462,41 @@ define amdgpu_kernel void @test_div_scale_f32_fabs_den(ptr addrspace(1) %out, pt
; GFX8-NEXT: flat_load_dword v1, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_fabs_den:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 0x7fffffff, v2
-; GFX10-NEXT: v_div_scale_f32 v0, s2, v0, v0, v1
+; GFX10-NEXT: v_div_scale_f32 v0, s0, v0, v0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_fabs_den:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v0, null, v0, v0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1509,29 +1531,30 @@ define amdgpu_kernel void @test_div_scale_f32_val_undef_val(ptr addrspace(1) %ou
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_val_undef_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, 0x41000000
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, 0x41000000
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_val_undef_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, 0x41000000
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1557,29 +1580,30 @@ define amdgpu_kernel void @test_div_scale_f32_undef_val_val(ptr addrspace(1) %ou
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0x41000000
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, s0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_val_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, 0x41000000, 0x41000000, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, 0x41000000, 0x41000000, s0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_val_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, 0x41000000, 0x41000000, s0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1603,29 +1627,30 @@ define amdgpu_kernel void @test_div_scale_f32_undef_undef_val(ptr addrspace(1) %
; GFX8-LABEL: test_div_scale_f32_undef_undef_val:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], s0, s0, s0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f32_undef_undef_val:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 null, 0
+; GFX10-NEXT: v_div_scale_f32 v0, s0, s0, s0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s2, s0, s0, s0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f32_undef_undef_val:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s0, s0, s0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1653,29 +1678,29 @@ define amdgpu_kernel void @test_div_scale_f64_val_undef_val(ptr addrspace(1) %ou
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0x40200000
; GFX8-NEXT: v_div_scale_f64 v[0:1], s[2:3], s[0:1], s[0:1], v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: test_div_scale_f64_val_undef_val:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_div_scale_f64 v[0:1], s2, s[0:1], s[0:1], 0x40200000
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_div_scale_f64_val_undef_val:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_div_scale_f64 v[0:1], null, s[0:1], s[0:1], 0x40200000
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 87d0d712d5bde..a4aea63bfdb9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -843,75 +843,47 @@ define amdgpu_kernel void @image_bvh_intersect_ray_a16_nsa_reassign(ptr %p_node_
}
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1030-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1030-NEXT: v_mov_b32_e32 v7, 4.0
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1030-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
-; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
-; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: flat_load_dword v2, v[0:1]
-; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
-; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1030-NEXT: s_endpgm
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
-; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0x40400000
-; GFX1013-NEXT: v_mov_b32_e32 v7, 4.0
-; GFX1013-NEXT: v_mov_b32_e32 v8, 0x40a00000
-; GFX1013-NEXT: v_mov_b32_e32 v9, 0x40c00000
-; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
-; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
-; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s2
-; GFX1013-NEXT: v_mov_b32_e32 v1, s3
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: flat_load_dword v2, v[0:1]
-; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
-; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1013-NEXT: s_endpgm
+; GFX10-LABEL: image_bvh64_intersect_ray_nsa_reassign:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX10-NEXT: v_mov_b32_e32 v7, 4.0
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x40a00000
+; GFX10-NEXT: v_mov_b32_e32 v9, 0x40c00000
+; GFX10-NEXT: v_mov_b32_e32 v10, 0x40e00000
+; GFX10-NEXT: v_mov_b32_e32 v11, 0x41000000
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c7
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s16, 0xb36211c7
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_movk_i32 s17, 0x102
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s8, 0x40400000
; GFX11-NEXT: s_mov_b32 s12, 0x40c00000
-; GFX11-NEXT: s_mov_b32 s6, 2.0
+; GFX11-NEXT: s_mov_b32 s1, 1.0
; GFX11-NEXT: s_mov_b32 s10, 0x40a00000
; GFX11-NEXT: s_mov_b32 s9, 4.0
; GFX11-NEXT: s_mov_b32 s14, 0x41000000
@@ -921,18 +893,18 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: v_mov_b32_e32 v7, s13
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_mov_b32 s5, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_mov_b32_e32 v10, s17
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3]
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
@@ -954,84 +926,59 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
}
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
-; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
-; GFX1030: ; %bb.0:
-; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
-; GFX1030-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1030-NEXT: v_mov_b32_e32 v3, 0
-; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1030-NEXT: v_mov_b32_e32 v6, 0x44004200
-; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
-; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
-; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1030-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1030-NEXT: flat_load_dword v2, v[0:1]
-; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
-; GFX1030-NEXT: s_waitcnt vmcnt(0)
-; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1030-NEXT: s_endpgm
-;
-; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
-; GFX1013: ; %bb.0:
-; GFX1013-NEXT: s_clause 0x1
-; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX1013-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX1013-NEXT: v_mov_b32_e32 v3, 0
-; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
-; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
-; GFX1013-NEXT: v_mov_b32_e32 v6, 0x44004200
-; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
-; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
-; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_mov_b32_e32 v0, s2
-; GFX1013-NEXT: v_mov_b32_e32 v1, s3
-; GFX1013-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
-; GFX1013-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX1013-NEXT: flat_load_dword v2, v[0:1]
-; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
-; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
-; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
-; GFX1013-NEXT: s_waitcnt vmcnt(0)
-; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
-; GFX1013-NEXT: s_endpgm
+; GFX10-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-NEXT: v_mov_b32_e32 v5, 2.0
+; GFX10-NEXT: v_mov_b32_e32 v6, 0x44004200
+; GFX10-NEXT: v_mov_b32_e32 v7, 0x46004500
+; GFX10-NEXT: v_mov_b32_e32 v8, 0x48004700
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v2, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, 0xb36211c6
+; GFX10-NEXT: v_mov_b32_e32 v1, 0x102
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
+; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: s_mov_b32 s12, 0xb36211c6
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_movk_i32 s13, 0x102
-; GFX11-NEXT: s_mov_b32 s6, 2.0
+; GFX11-NEXT: s_mov_b32 s1, 1.0
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: s_mov_b32 s8, 0x42004600
; GFX11-NEXT: s_mov_b32 s9, 0x44004700
; GFX11-NEXT: s_mov_b32 s10, 0x45004800
; GFX11-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v4, s9
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: s_mov_b32 s5, 1.0
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: v_dual_mov_b32 v5, s10 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_mov_b32 s2, 2.0
; GFX11-NEXT: v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s6
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index e7faabb72ab69..66d1f5ff56f08 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -353,8 +353,8 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-LABEL: test_mfma_f64_16x16x4f64_splat_lit:
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s4, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-NEXT: s_mov_b32 s5, 0x405ec000
; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -369,7 +369,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
; GCN-NEXT: v_accvgpr_write_b32 a5, s9
; GCN-NEXT: v_accvgpr_write_b32 a6, s10
; GCN-NEXT: v_accvgpr_write_b32 a7, s11
-; GCN-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GCN-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
; GCN-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
index c0cd068607200..f712df2ea9898 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll
@@ -8,14 +8,14 @@
define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
; GFX8-LABEL: dpp_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -34,12 +34,12 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
; GFX11-LABEL: dpp_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1 ; encoding: [0x01,0x00,0x85,0xbf]
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; encoding: [0x80,0x00,0x00,0xf4,0x2c,0x00,0x00,0xf8]
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; encoding: [0x00,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c ; encoding: [0x00,0x01,0x00,0xf4,0x2c,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; encoding: [0x80,0x00,0x04,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; encoding: [0x80,0x00,0x10,0xca,0x02,0x00,0x00,0x01]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 ; encoding: [0x80,0x00,0x10,0xca,0x04,0x00,0x00,0x01]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x00,0x00]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3] ; encoding: [0x00,0x00,0x6a,0xdc,0x01,0x00,0x02,0x00]
; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
@@ -50,38 +50,38 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @mov_dpp64_test(ptr addrspace(1) %out, i64 %in1) {
; GFX8-LABEL: mov_dpp64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: mov_dpp64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xfa]
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; encoding: [0x00,0x01,0x08,0xf4,0x24,0x00,0x00,0xfa]
; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x7f,0xc0,0x8c,0xbf]
-; GFX10-NEXT: v_mov_b32_e32 v0, s2 ; encoding: [0x02,0x02,0x00,0x7e]
-; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; encoding: [0x03,0x02,0x02,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v0, s6 ; encoding: [0x06,0x02,0x00,0x7e]
+; GFX10-NEXT: v_mov_b32_e32 v1, s7 ; encoding: [0x07,0x02,0x02,0x7e]
; GFX10-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX10-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x00,0x00]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; encoding: [0x00,0x80,0x74,0xdc,0x02,0x00,0x04,0x00]
; GFX10-NEXT: s_endpgm ; encoding: [0x00,0x00,0x81,0xbf]
;
; GFX11-LABEL: mov_dpp64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; encoding: [0x00,0x00,0x08,0xf4,0x24,0x00,0x00,0xf8]
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; encoding: [0x00,0x01,0x08,0xf4,0x24,0x00,0x00,0xf8]
; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; encoding: [0x80,0x02,0x04,0x7e]
; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; encoding: [0x07,0xfc,0x89,0xbf]
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; encoding: [0x02,0x00,0x10,0xca,0x03,0x00,0x00,0x00]
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; encoding: [0x06,0x00,0x10,0xca,0x07,0x00,0x00,0x00]
; GFX11-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x00,0x11]
; GFX11-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; encoding: [0xfa,0x02,0x02,0x7e,0x01,0x01,0x00,0x11]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x00,0x00]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; encoding: [0x00,0x00,0x6e,0xdc,0x02,0x00,0x04,0x00]
; GFX11-NEXT: s_nop 0 ; encoding: [0x00,0x00,0x80,0xbf]
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; encoding: [0x03,0x00,0xb6,0xbf]
; GFX11-NEXT: s_endpgm ; encoding: [0x00,0x00,0xb0,0xbf]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
index fa24489df52dc..3d352db43fdf9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll
@@ -4,16 +4,16 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -23,17 +23,17 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -47,32 +47,32 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s2, s[4:7], 0x0
; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s2, 56
-; GCN-NEXT: s_cselect_b32 s4, 1, 0
+; GCN-NEXT: s_cselect_b32 s1, 1, 0
; GCN-NEXT: v_mov_b32_e32 v0, s3
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, 1
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_mov_b32 s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_cbranch_scc0 .LBB2_2
; GCN-NEXT: ; %bb.1: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; GCN-NEXT: s_mov_b32 s2, 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: .LBB2_2: ; %Flow
-; GCN-NEXT: s_xor_b32 s2, s2, 1
-; GCN-NEXT: s_and_b32 s2, s2, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_xor_b32 s0, s0, 1
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_4
; GCN-NEXT: ; %bb.3: ; %.zero
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: .LBB2_4: ; %.exit
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
@@ -96,17 +96,17 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x40400000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
store float %tmp, ptr addrspace(1) %out
@@ -116,21 +116,21 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
-; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
+; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v2
; GCN-NEXT: v_mov_b32_e32 v1, v3
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
store double %tmp, ptr addrspace(1) %out
@@ -140,17 +140,17 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x10001
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
store <2 x i16> %tmp, ptr addrspace(1) %out
@@ -160,17 +160,17 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3c003c00
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
store <2 x half> %tmp, ptr addrspace(1) %out
@@ -228,17 +228,17 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0x3f803f80
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
store <2 x bfloat> %tmp, ptr addrspace(1) %out
@@ -320,17 +320,17 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
store ptr %tmp, ptr addrspace(1) %out
@@ -340,16 +340,16 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
@@ -359,16 +359,16 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
@@ -378,16 +378,16 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
@@ -397,16 +397,16 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s3, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
index d6282708ece35..7f720e5c7766e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll
@@ -46,47 +46,47 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i64 %in2) {
; GFX8-LABEL: update_dppi64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppi64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppi64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -101,47 +101,47 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1, double %in2) {
; GFX8-LABEL: update_dppf64_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dppf64_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dppf64_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -266,47 +266,47 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p0_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
+; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: update_dpp_p0_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v2, s2
-; GFX10-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v2, s6
+; GFX10-NEXT: v_mov_b32_e32 v3, s7
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: update_dpp_p0_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX11-NEXT: global_load_b64 v[0:1], v4, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX11-NEXT: global_load_b64 v[0:1], v4, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
; GFX11-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
-; GFX11-NEXT: global_store_b64 v4, v[2:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -321,13 +321,13 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspace(3) %in1, ptr %in2) {
; GFX8-LABEL: update_dpp_p3_test:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: ds_read_b32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -336,11 +336,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p3_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: ds_read_b32 v1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -349,11 +349,11 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
;
; GFX11-LABEL: update_dpp_p3_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: ds_load_b32 v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -371,17 +371,17 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
; GFX8-LABEL: update_dpp_p5_test:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GFX8-NEXT: s_mov_b32 s90, -1
; GFX8-NEXT: s_mov_b32 s91, 0xe80000
; GFX8-NEXT: s_add_u32 s88, s88, s3
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_addc_u32 s89, s89, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_nop 0
; GFX8-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -390,17 +390,17 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX10-LABEL: update_dpp_p5_test:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
; GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_mov_b32 s7, 0x31c16000
; GFX10-NEXT: s_add_u32 s4, s4, s3
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_addc_u32 s5, s5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX10-NEXT: v_mov_b32_e32 v2, s1
+; GFX10-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, s3
; GFX10-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
@@ -409,11 +409,11 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
;
; GFX11-LABEL: update_dpp_p5_test:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v0, s0, v0
-; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: v_add_nc_u32_e32 v0, s2, v0
+; GFX11-NEXT: v_mov_b32_e32 v2, s3
; GFX11-NEXT: scratch_load_b32 v1, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_dpp v2, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 6bb104311a4d8..9251f262a5005 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1017,7 +1017,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align4:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1054,7 +1056,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_i96_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1091,7 +1095,9 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v3i32_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1128,7 +1134,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v6i16_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ; return to shader part epilog
;
@@ -1166,7 +1174,9 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg %ptr) {
; GFX12-LABEL: s_load_constant_v12i8_align8:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_lshr_b32 s13, s0, 8
; GFX12-NEXT: s_lshr_b32 s12, s0, 16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 1140ef88ac7f8..e1fcca0089359 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -7,28 +7,28 @@ declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[4:5]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
; GFX10-NEXT: v_mov_b32_e32 v5, v0
-; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
-; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v9, s[4:5]
+; GFX11-NEXT: global_load_b64 v[2:3], v9, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -36,7 +36,7 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v5, v7
-; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
+; GFX11-NEXT: global_store_b64 v9, v[4:5], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -76,12 +76,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v5, v2, s[0:1]
+; GFX11-NEXT: global_load_b32 v5, v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -128,12 +128,12 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -177,13 +177,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -227,12 +227,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v5, v0, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -277,12 +277,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_lo(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -327,12 +327,12 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[1:2], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[1:2], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v0, v2
; GFX11-NEXT: v_mov_b32_e32 v0, 0
@@ -355,21 +355,21 @@ define amdgpu_kernel void @v_mul_i64_masked_src1_lo(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @v_mul_i64_masked_src0(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul_i64_masked_src0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_masked_src0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -413,12 +413,12 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -450,21 +450,21 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
define amdgpu_kernel void @v_mul64_masked_before_branch(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) {
; GFX10-LABEL: v_mul64_masked_before_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_masked_before_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,13 +534,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX11-NEXT: s_mov_b32 s0, exec_lo
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7]
-; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1]
-; GFX11-NEXT: s_mov_b32 s0, exec_lo
+; GFX11-NEXT: global_load_b64 v[4:5], v0, s[2:3]
; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 2d81452f9ef38..35de4a3194b3b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -2559,76 +2559,76 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT: s_mulk_i32 s2, 0x50
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT: s_mulk_i32 s0, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX10-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2738,88 +2738,88 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
-; GFX8-NEXT: s_ashr_i32 s3, s2, 31
-; GFX8-NEXT: s_mulk_i32 s2, 0x50
-; GFX8-NEXT: s_mulk_i32 s3, 0x50
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_add_u32 s3, s3, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX8-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NEXT: s_mulk_i32 s0, 0x50
+; GFX8-NEXT: s_mulk_i32 s1, 0x50
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_add_u32 s1, s1, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
-; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
-; GFX9-NEXT: s_mulk_i32 s4, 0x50
-; GFX9-NEXT: s_add_u32 s3, s4, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s2, s1, 31
+; GFX9-NEXT: s_mul_i32 s0, s1, 0x50
+; GFX9-NEXT: s_mul_hi_u32 s1, s1, 0x50
+; GFX9-NEXT: s_mulk_i32 s2, 0x50
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ashr_i32 s3, s2, 31
-; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
-; GFX10-NEXT: s_mulk_i32 s3, 0x50
-; GFX10-NEXT: s_mulk_i32 s2, 0x50
-; GFX10-NEXT: s_add_i32 s3, s4, s3
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_ashr_i32 s1, s0, 31
+; GFX10-NEXT: s_mul_hi_u32 s2, s0, 0x50
+; GFX10-NEXT: s_mulk_i32 s1, 0x50
+; GFX10-NEXT: s_mulk_i32 s0, 0x50
+; GFX10-NEXT: s_add_i32 s1, s2, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_ashr_i32 s3, s2, 31
-; GFX11-NEXT: s_mul_hi_u32 s4, s2, 0x50
-; GFX11-NEXT: s_mulk_i32 s3, 0x50
-; GFX11-NEXT: s_mulk_i32 s2, 0x50
-; GFX11-NEXT: s_add_i32 s3, s4, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_ashr_i32 s1, s0, 31
+; GFX11-NEXT: s_mul_hi_u32 s2, s0, 0x50
+; GFX11-NEXT: s_mulk_i32 s1, 0x50
+; GFX11-NEXT: s_mulk_i32 s0, 0x50
+; GFX11-NEXT: s_add_i32 s1, s2, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], 0x50
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index c3bd56610d102..5d4f1f6522f1a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -145,25 +145,25 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: sdivrem_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_ashr_i32 s2, s9, 31
-; GFX8-NEXT: s_ashr_i32 s12, s11, 31
-; GFX8-NEXT: s_add_u32 s0, s8, s2
-; GFX8-NEXT: s_addc_u32 s1, s9, s2
-; GFX8-NEXT: s_add_u32 s8, s10, s12
-; GFX8-NEXT: s_mov_b32 s13, s12
-; GFX8-NEXT: s_addc_u32 s9, s11, s12
-; GFX8-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s8
+; GFX8-NEXT: s_ashr_i32 s2, s13, 31
+; GFX8-NEXT: s_ashr_i32 s4, s15, 31
+; GFX8-NEXT: s_add_u32 s0, s12, s2
+; GFX8-NEXT: s_addc_u32 s1, s13, s2
+; GFX8-NEXT: s_add_u32 s6, s14, s4
+; GFX8-NEXT: s_mov_b32 s5, s4
+; GFX8-NEXT: s_addc_u32 s7, s15, s4
+; GFX8-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX8-NEXT: s_mov_b32 s3, s2
-; GFX8-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX8-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3]
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: s_sub_u32 s14, 0, s8
-; GFX8-NEXT: s_subb_u32 s15, 0, s9
+; GFX8-NEXT: s_sub_u32 s14, 0, s6
+; GFX8-NEXT: s_subb_u32 s15, 0, s7
; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX8-NEXT: v_trunc_f32_e32 v2, v1
@@ -223,53 +223,53 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s11, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s10, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v6, s13
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v4, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
+; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s6, v0
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc
; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v7
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v8
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s6, v7
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
@@ -284,7 +284,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
@@ -295,35 +295,35 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v4, v5, vcc
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sdivrem_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s2, s9, 31
-; GFX9-NEXT: s_ashr_i32 s12, s11, 31
-; GFX9-NEXT: s_add_u32 s0, s8, s2
-; GFX9-NEXT: s_addc_u32 s1, s9, s2
-; GFX9-NEXT: s_add_u32 s8, s10, s12
-; GFX9-NEXT: s_mov_b32 s13, s12
-; GFX9-NEXT: s_addc_u32 s9, s11, s12
-; GFX9-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s8
+; GFX9-NEXT: s_ashr_i32 s2, s13, 31
+; GFX9-NEXT: s_ashr_i32 s4, s15, 31
+; GFX9-NEXT: s_add_u32 s0, s12, s2
+; GFX9-NEXT: s_addc_u32 s1, s13, s2
+; GFX9-NEXT: s_add_u32 s6, s14, s4
+; GFX9-NEXT: s_mov_b32 s5, s4
+; GFX9-NEXT: s_addc_u32 s7, s15, s4
+; GFX9-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
; GFX9-NEXT: s_mov_b32 s3, s2
-; GFX9-NEXT: s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX9-NEXT: s_xor_b64 s[12:13], s[0:1], s[2:3]
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_u32 s14, 0, s8
-; GFX9-NEXT: s_subb_u32 s15, 0, s9
+; GFX9-NEXT: s_sub_u32 s14, 0, s6
+; GFX9-NEXT: s_subb_u32 s15, 0, s7
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
@@ -357,7 +357,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2]
@@ -382,52 +382,52 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s11, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s10, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s11
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s10, v0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2]
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v6, s13
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s12, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2]
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2
-; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2
+; GFX9-NEXT: v_sub_u32_e32 v1, s13, v1
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s8, v0
+; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s8, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -442,7 +442,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1]
-; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5]
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5
; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -453,27 +453,27 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_mov_b32_e32 v6, s2
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdivrem_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_ashr_i32 s2, s9, 31
-; GFX10-NEXT: s_ashr_i32 s12, s11, 31
-; GFX10-NEXT: s_add_u32 s0, s8, s2
-; GFX10-NEXT: s_addc_u32 s1, s9, s2
-; GFX10-NEXT: s_add_u32 s8, s10, s12
-; GFX10-NEXT: s_mov_b32 s13, s12
-; GFX10-NEXT: s_addc_u32 s9, s11, s12
+; GFX10-NEXT: s_ashr_i32 s2, s13, 31
+; GFX10-NEXT: s_ashr_i32 s4, s15, 31
+; GFX10-NEXT: s_add_u32 s0, s12, s2
+; GFX10-NEXT: s_addc_u32 s1, s13, s2
+; GFX10-NEXT: s_add_u32 s6, s14, s4
+; GFX10-NEXT: s_mov_b32 s5, s4
+; GFX10-NEXT: s_addc_u32 s7, s15, s4
; GFX10-NEXT: s_mov_b32 s3, s2
-; GFX10-NEXT: s_xor_b64 s[8:9], s[8:9], s[12:13]
+; GFX10-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s9
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s8
-; GFX10-NEXT: s_sub_u32 s10, 0, s8
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX10-NEXT: s_sub_u32 s12, 0, s6
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -484,11 +484,12 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s11, s10, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s11, s10, v4, v[1:2]
-; GFX10-NEXT: s_subb_u32 s11, 0, s9
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s13, s12, v3, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s13, s12, v4, v[1:2]
+; GFX10-NEXT: s_subb_u32 s13, 0, s7
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s11, v3, v[1:2]
+; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s14, s13, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
@@ -510,28 +511,28 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s10, v3, 0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s10, v4, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s14, s12, v3, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s12, v4, v[1:2]
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s11, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s13, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX10-NEXT: v_mul_hi_u32 v0, v3, v0
; GFX10-NEXT: v_mul_lo_u32 v5, v3, v1
; GFX10-NEXT: v_mul_lo_u32 v7, v4, v1
; GFX10-NEXT: v_mul_hi_u32 v8, v3, v1
; GFX10-NEXT: v_mul_hi_u32 v1, v4, v1
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v5
-; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v6, s10, v7, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v2, s10, v6, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v5
+; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v6, s12, v7, v6
+; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v2, s12, v6, v8
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v0, v5, v0
; GFX10-NEXT: v_add_nc_u32_e32 v5, v7, v6
-; GFX10-NEXT: v_add_co_u32 v0, s10, v2, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10
+; GFX10-NEXT: v_add_co_u32 v0, s12, v2, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX10-NEXT: v_mul_lo_u32 v2, s1, v0
@@ -540,71 +541,70 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_mul_hi_u32 v0, s1, v0
; GFX10-NEXT: v_mul_lo_u32 v3, s0, v1
; GFX10-NEXT: v_mul_lo_u32 v5, s1, v1
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v3
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v3
; GFX10-NEXT: v_mul_hi_u32 v3, s0, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v2, s10, v2, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v0, s10, v5, v0
-; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v2, s12, v2, v4
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v0, s12, v5, v0
+; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v2, v6, v2
-; GFX10-NEXT: v_add_co_u32 v0, s10, v0, v3
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s10
-; GFX10-NEXT: v_add_co_u32 v5, s10, v0, v2
+; GFX10-NEXT: v_add_co_u32 v0, s12, v0, v3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s12
+; GFX10-NEXT: v_add_co_u32 v5, s12, v0, v2
; GFX10-NEXT: v_mul_hi_u32 v2, s1, v1
-; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s12
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s10, s8, v5, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s12, s6, v5, 0
; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s8, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s10, s9, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s6, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s12, s7, v5, v[1:2]
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v6, s1, v1
; GFX10-NEXT: v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s8, v0
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s6, v0
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s8
+; GFX10-NEXT: v_sub_co_u32 v8, vcc_lo, v0, s6
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v1
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v1
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v6, vcc_lo, s7, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s8, v8
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s6, v8
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s9, v9
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s7, v9
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v9
; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s9, v1
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s7, v1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v7, v10, v7, s0
-; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s8
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v8, s6
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v7
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v7, v8, v10, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo
-; GFX10-NEXT: s_xor_b64 s[8:9], s[2:3], s[12:13]
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v7, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: v_xor_b32_e32 v2, s8, v2
-; GFX10-NEXT: v_xor_b32_e32 v3, s9, v3
+; GFX10-NEXT: v_xor_b32_e32 v2, s4, v2
+; GFX10-NEXT: v_xor_b32_e32 v3, s5, v3
; GFX10-NEXT: v_xor_b32_e32 v5, s2, v0
; GFX10-NEXT: v_xor_b32_e32 v6, s2, v1
-; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s8
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v3, vcc_lo
+; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v2, s4
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s5, v3, vcc_lo
; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v5, s2
; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s2, v6, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = sdiv i64 %x, %y
store i64 %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 63a0d8afd4de0..51c213ed7eaa3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -112,12 +112,12 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
; GFX8-LABEL: udivrem_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX8-NEXT: s_sub_u32 s2, 0, s10
-; GFX8-NEXT: s_subb_u32 s3, 0, s11
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT: s_sub_u32 s2, 0, s14
+; GFX8-NEXT: s_subb_u32 s3, 0, s15
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -180,53 +180,53 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX8-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v4, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v6, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s11
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2]
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v6, s13
+; GFX8-NEXT: v_mov_b32_e32 v5, s15
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v4, v[1:2]
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s12, v0
; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
+; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s10, v2
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v2
; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc
; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4
; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v8
; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v8
+; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s14, v7
; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9
; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
@@ -241,22 +241,22 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v14, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v4, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udivrem_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX9-NEXT: s_sub_u32 s2, 0, s10
-; GFX9-NEXT: s_subb_u32 s3, 0, s11
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: s_subb_u32 s3, 0, s15
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -293,7 +293,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
-; GFX9-NEXT: v_mov_b32_e32 v7, s11
+; GFX9-NEXT: v_mov_b32_e32 v7, s15
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
@@ -318,52 +318,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v6, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v5, 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v5, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v6, s9
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v3, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v6, s13
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v5, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v5, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v0
; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v6
-; GFX9-NEXT: v_sub_u32_e32 v0, s9, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v6
+; GFX9-NEXT: v_sub_u32_e32 v0, s13, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v6
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v6
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s10, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s14, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc
; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5
; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v9
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v9
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v8
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v7, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v9
-; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s10, v8
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v9
+; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s14, v8
; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10
; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc
@@ -378,17 +378,17 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v15, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v5, s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s10
-; GFX10-NEXT: s_sub_u32 s0, 0, s10
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX10-NEXT: s_sub_u32 s0, 0, s14
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -401,7 +401,7 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cvt_u32_f32_e32 v3, v0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s0, v3, 0
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, s0, v4, v[1:2]
-; GFX10-NEXT: s_subb_u32 s1, 0, s11
+; GFX10-NEXT: s_subb_u32 s1, 0, s15
; GFX10-NEXT: v_mul_hi_u32 v6, v4, v0
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s2, s1, v3, v[1:2]
; GFX10-NEXT: v_mul_lo_u32 v2, v4, v0
@@ -449,14 +449,14 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v3, v0
; GFX10-NEXT: v_add3_u32 v1, v5, v2, v1
-; GFX10-NEXT: v_mul_lo_u32 v2, s9, v0
+; GFX10-NEXT: v_mul_lo_u32 v2, s13, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v4, v1, vcc_lo
-; GFX10-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX10-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX10-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX10-NEXT: v_mul_lo_u32 v5, s9, v1
+; GFX10-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX10-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX10-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX10-NEXT: v_mul_lo_u32 v5, s13, v1
; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v3
-; GFX10-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX10-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v2, s0, v2, v4
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
@@ -466,38 +466,38 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_add_co_u32 v0, s0, v0, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v5, s0, v0, v2
-; GFX10-NEXT: v_mul_hi_u32 v2, s9, v1
+; GFX10-NEXT: v_mul_hi_u32 v2, s13, v1
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
; GFX10-NEXT: v_add_nc_u32_e32 v3, v4, v3
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s10, v5, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s14, v5, 0
; GFX10-NEXT: v_add3_u32 v3, v3, v6, v2
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s10, v3, v[1:2]
-; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s11, v5, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s14, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s0, s15, v5, v[1:2]
; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v5, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s8, v0
-; GFX10-NEXT: v_sub_nc_u32_e32 v6, s9, v1
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s10, v7
+; GFX10-NEXT: v_sub_co_u32 v7, vcc_lo, s12, v0
+; GFX10-NEXT: v_sub_nc_u32_e32 v6, s13, v1
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v8, s0, s13, v1, vcc_lo
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v6, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v7
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s10
+; GFX10-NEXT: v_sub_co_u32 v6, vcc_lo, v7, s14
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v8
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v8
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v0, vcc_lo, s15, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v6
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s14, v6
; GFX10-NEXT: v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s11, v9
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s15, v9
; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, -1, s0
; GFX10-NEXT: v_add_co_u32 v13, s0, v2, 1
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v9
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v9
; GFX10-NEXT: v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s11, v8
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s15, v8
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v11
; GFX10-NEXT: v_cndmask_b32_e64 v1, v10, v1, s0
-; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s10
+; GFX10-NEXT: v_sub_co_u32 v10, s0, v6, s14
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v14, vcc_lo
@@ -509,8 +509,8 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v1, v3, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v6, s0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v8, v9, s0
-; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[4:5]
-; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[6:7]
+; GFX10-NEXT: global_store_dwordx2 v10, v[0:1], s[8:9]
+; GFX10-NEXT: global_store_dwordx2 v10, v[2:3], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv i64 %x, %y
store i64 %div, ptr addrspace(1) %out0
@@ -979,13 +979,13 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: udivrem_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
-; GFX8-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; GFX8-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s12
-; GFX8-NEXT: s_sub_u32 s2, 0, s12
-; GFX8-NEXT: s_subb_u32 s3, 0, s13
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s16
+; GFX8-NEXT: s_sub_u32 s2, 0, s16
+; GFX8-NEXT: s_subb_u32 s3, 0, s17
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -1025,12 +1025,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX8-NEXT: s_sub_u32 s2, 0, s14
+; GFX8-NEXT: s_sub_u32 s2, 0, s18
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0
; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX8-NEXT: s_subb_u32 s3, 0, s15
+; GFX8-NEXT: s_subb_u32 s3, 0, s19
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6
@@ -1050,46 +1050,46 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0
+; GFX8-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX8-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX8-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX8-NEXT: v_mul_hi_u32 v0, s13, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX8-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; GFX8-NEXT: v_mul_hi_u32 v3, s8, v1
+; GFX8-NEXT: v_mul_hi_u32 v3, s12, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v4, v0
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2
-; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v6, 0
+; GFX8-NEXT: v_mul_hi_u32 v4, s13, v1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v7, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s8, v0
-; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s13, v6, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v4, s13
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v7, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v3, s13
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s12, v0
+; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v6, v[1:2]
+; GFX8-NEXT: v_mov_b32_e32 v4, s17
; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v1, vcc
-; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s9, v1
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0
+; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s13, v1
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v0
; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v8
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v0
; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v3, s[0:1]
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s15
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s19
; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v4, vcc
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s14
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s18
; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v2
-; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v8
+; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s16, v8
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v5, vcc
@@ -1101,13 +1101,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_f32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v1
; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v7, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v11
; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v10
; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v15, 0
; GFX8-NEXT: v_cvt_u32_f32_e32 v14, v14
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v11
; GFX8-NEXT: v_cndmask_b32_e64 v16, v3, v16, s[0:1]
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v14, v[2:3]
; GFX8-NEXT: v_add_u32_e64 v17, s[0:1], 1, v12
@@ -1116,7 +1116,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v4, vcc
; GFX8-NEXT: v_mul_lo_u32 v4, v14, v1
; GFX8-NEXT: v_mul_lo_u32 v5, v15, v2
-; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10
+; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s16, v10
; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v3, vcc
; GFX8-NEXT: v_mul_hi_u32 v3, v15, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
@@ -1175,55 +1175,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v15, v3
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v3
-; GFX8-NEXT: v_mul_lo_u32 v8, s10, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, s15, v3
+; GFX8-NEXT: v_mul_lo_u32 v8, s14, v4
; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1]
-; GFX8-NEXT: v_mul_hi_u32 v0, s10, v3
-; GFX8-NEXT: v_mul_hi_u32 v3, s11, v3
+; GFX8-NEXT: v_mul_hi_u32 v0, s14, v3
+; GFX8-NEXT: v_mul_hi_u32 v3, s15, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4
+; GFX8-NEXT: v_mul_lo_u32 v7, s15, v4
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v8, v0
-; GFX8-NEXT: v_mul_hi_u32 v8, s10, v4
+; GFX8-NEXT: v_mul_hi_u32 v8, s14, v4
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v7, v3
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8
; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v3, v0
-; GFX8-NEXT: v_mul_hi_u32 v8, s11, v4
-; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s14, v9, 0
+; GFX8-NEXT: v_mul_hi_u32 v8, s15, v4
+; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s18, v9, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v8, v0
; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s14, v10, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s11
-; GFX8-NEXT: v_mov_b32_e32 v0, s15
-; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s15, v9, v[7:8]
-; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s10, v3
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s18, v10, v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v4, s15
+; GFX8-NEXT: v_mov_b32_e32 v0, s19
+; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s19, v9, v[7:8]
+; GFX8-NEXT: v_sub_u32_e32 v8, vcc, s14, v3
; GFX8-NEXT: v_subb_u32_e64 v11, s[0:1], v4, v7, vcc
-; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v7
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s15, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v11
; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v8
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v8
; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v11
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v11
; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[0:1]
-; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s14, v8
+; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s18, v8
; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v3, vcc
; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v9
; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v10, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v12
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v12
; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7
; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v3, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v12
-; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s14, v7
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v12
+; GFX8-NEXT: v_subrev_u32_e32 v18, vcc, s18, v7
; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v16, s[0:1]
; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v13
; GFX8-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
@@ -1234,30 +1234,30 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4
; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v13, s[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s9
; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v18, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v9, s4
+; GFX8-NEXT: v_mov_b32_e32 v9, s8
; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1]
; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v0, s[0:1]
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[1:4]
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[5:8]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: udivrem_v2i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX9-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s12
-; GFX9-NEXT: s_sub_u32 s2, 0, s12
-; GFX9-NEXT: s_subb_u32 s3, 0, s13
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s16
+; GFX9-NEXT: s_sub_u32 s2, 0, s16
+; GFX9-NEXT: s_subb_u32 s3, 0, s17
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v1
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2]
; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0
-; GFX9-NEXT: s_sub_u32 s2, 0, s14
+; GFX9-NEXT: s_sub_u32 s2, 0, s18
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2]
; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0
; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0
; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1
-; GFX9-NEXT: s_subb_u32 s3, 0, s15
+; GFX9-NEXT: s_subb_u32 s3, 0, s19
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6
@@ -1317,48 +1317,47 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0
-; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1
-; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0
-; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0
-; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v2, s13, v0
+; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
+; GFX9-NEXT: v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX9-NEXT: v_mul_hi_u32 v5, s13, v1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1
+; GFX9-NEXT: v_mul_lo_u32 v4, s13, v1
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
-; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1
-; GFX9-NEXT: v_mov_b32_e32 v6, s13
+; GFX9-NEXT: v_mul_hi_u32 v3, s12, v1
+; GFX9-NEXT: v_mov_b32_e32 v6, s17
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, 0
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v3, v4, v3
; GFX9-NEXT: v_add3_u32 v9, v3, v0, v5
; GFX9-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s9
+; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v9, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v5, s13
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s13, v8, v[2:3]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s17, v8, v[2:3]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s12, v1
; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v5, v3, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v1
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v2
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v1
-; GFX9-NEXT: v_sub_u32_e32 v3, s9, v3
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v1
+; GFX9-NEXT: v_sub_u32_e32 v3, s13, v3
; GFX9-NEXT: v_cndmask_b32_e64 v10, v4, v5, s[0:1]
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s15
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s19
; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v3, v6, vcc
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s14
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s18
; GFX9-NEXT: v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s12, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s16, v2
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3
; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc
@@ -1370,13 +1369,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v3
; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s17, v12
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s16, v11
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v16, 0
; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v12
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v12
; GFX9-NEXT: v_cndmask_b32_e64 v17, v5, v17, s[0:1]
; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, v[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v18, s[0:1], 1, v13
@@ -1385,7 +1384,7 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v6, vcc
; GFX9-NEXT: v_mul_lo_u32 v6, v15, v3
; GFX9-NEXT: v_mul_lo_u32 v7, v16, v4
-; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s12, v11
+; GFX9-NEXT: v_subrev_co_u32_e32 v20, vcc, s16, v11
; GFX9-NEXT: v_subbrev_co_u32_e32 v21, vcc, 0, v5, vcc
; GFX9-NEXT: v_mul_hi_u32 v5, v16, v3
; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7
@@ -1441,55 +1440,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_add3_u32 v6, v9, v8, v6
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v16, v5
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s11, v5
-; GFX9-NEXT: v_mul_lo_u32 v9, s10, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, s15, v5
+; GFX9-NEXT: v_mul_lo_u32 v9, s14, v6
; GFX9-NEXT: v_cndmask_b32_e64 v7, v2, v7, s[0:1]
-; GFX9-NEXT: v_mul_hi_u32 v2, s10, v5
-; GFX9-NEXT: v_mul_hi_u32 v5, s11, v5
+; GFX9-NEXT: v_mul_hi_u32 v2, s14, v5
+; GFX9-NEXT: v_mul_hi_u32 v5, s15, v5
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: v_mul_lo_u32 v8, s11, v6
+; GFX9-NEXT: v_mul_lo_u32 v8, s15, v6
; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
-; GFX9-NEXT: v_mul_hi_u32 v9, s10, v6
-; GFX9-NEXT: v_mul_hi_u32 v13, s11, v6
+; GFX9-NEXT: v_mul_hi_u32 v9, s14, v6
+; GFX9-NEXT: v_mul_hi_u32 v13, s15, v6
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v8, v5
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v5, v2
-; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s14, v12, 0
+; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s18, v12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v10, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v1, v11, v9
; GFX9-NEXT: v_add3_u32 v9, v1, v2, v13
; GFX9-NEXT: v_mov_b32_e32 v1, v6
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v9, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v10, s11
-; GFX9-NEXT: v_mov_b32_e32 v6, s15
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v12, v[1:2]
-; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s10, v5
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v9, v[1:2]
+; GFX9-NEXT: v_mov_b32_e32 v10, s15
+; GFX9-NEXT: v_mov_b32_e32 v6, s19
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v12, v[1:2]
+; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s14, v5
; GFX9-NEXT: v_subb_co_u32_e64 v10, s[0:1], v10, v1, vcc
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v10
-; GFX9-NEXT: v_sub_u32_e32 v1, s11, v1
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v10
+; GFX9-NEXT: v_sub_u32_e32 v1, s15, v1
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v2
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v10
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v10
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1]
-; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s14, v2
+; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s18, v2
; GFX9-NEXT: v_subbrev_co_u32_e64 v13, s[0:1], 0, v1, vcc
; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v12
; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v9, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s15, v13
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v13
; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s14, v11
+; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v11
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1]
-; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s15, v13
-; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s14, v11
+; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v13
+; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s18, v11
; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1]
; GFX9-NEXT: v_add_co_u32_e64 v17, s[0:1], 1, v14
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -1504,22 +1503,24 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v1, s[0:1]
-; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[4:5]
-; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[6:7]
+; GFX9-NEXT: global_store_dwordx4 v0, v[3:6], s[8:9]
+; GFX9-NEXT: global_store_dwordx4 v0, v[7:10], s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: udivrem_v2i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x20
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x20
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s13
-; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s15
-; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s12
-; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s14
-; GFX10-NEXT: s_sub_u32 s0, 0, s12
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s17
+; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s19
+; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s16
+; GFX10-NEXT: v_cvt_f32_u32_e32 v3, s18
+; GFX10-NEXT: s_sub_u32 s0, 0, s16
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT: s_subb_u32 s1, 0, s13
+; GFX10-NEXT: s_subb_u32 s1, 0, s17
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
@@ -1539,13 +1540,13 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_cvt_u32_f32_e32 v7, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v8, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, s0, v7, 0
-; GFX10-NEXT: s_sub_u32 s2, 0, s14
+; GFX10-NEXT: s_sub_u32 s2, 0, s18
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s3, s2, v8, 0
; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s3, s0, v9, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s3, s2, v10, v[3:4]
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v0
-; GFX10-NEXT: s_subb_u32 s3, 0, s15
+; GFX10-NEXT: s_subb_u32 s3, 0, s19
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, s1, v7, v[4:5]
; GFX10-NEXT: v_mul_hi_u32 v4, v7, v0
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s3, v8, v[5:6]
@@ -1592,7 +1593,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, s0, v7, 0
; GFX10-NEXT: v_add_co_ci_u32_e32 v10, vcc_lo, v10, v2, vcc_lo
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s6, s2, v8, 0
-; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
; GFX10-NEXT: v_mul_hi_u32 v11, v9, v0
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s0, v9, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s2, v10, v[3:4]
@@ -1641,21 +1641,20 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v9, v3, vcc_lo
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v8, v1
; GFX10-NEXT: v_add_co_ci_u32_e32 v0, vcc_lo, v10, v0, vcc_lo
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mul_lo_u32 v3, s9, v4
-; GFX10-NEXT: v_mul_lo_u32 v8, s8, v2
-; GFX10-NEXT: v_mul_hi_u32 v5, s8, v4
-; GFX10-NEXT: v_mul_hi_u32 v4, s9, v4
-; GFX10-NEXT: v_mul_lo_u32 v9, s9, v2
-; GFX10-NEXT: v_mul_lo_u32 v6, s11, v1
-; GFX10-NEXT: v_mul_hi_u32 v10, s8, v2
-; GFX10-NEXT: v_mul_hi_u32 v11, s9, v2
-; GFX10-NEXT: v_mul_lo_u32 v2, s10, v0
-; GFX10-NEXT: v_mul_hi_u32 v7, s10, v1
-; GFX10-NEXT: v_mul_hi_u32 v1, s11, v1
-; GFX10-NEXT: v_mul_lo_u32 v12, s11, v0
-; GFX10-NEXT: v_mul_hi_u32 v13, s10, v0
-; GFX10-NEXT: v_mul_hi_u32 v14, s11, v0
+; GFX10-NEXT: v_mul_lo_u32 v3, s13, v4
+; GFX10-NEXT: v_mul_lo_u32 v8, s12, v2
+; GFX10-NEXT: v_mul_hi_u32 v5, s12, v4
+; GFX10-NEXT: v_mul_hi_u32 v4, s13, v4
+; GFX10-NEXT: v_mul_lo_u32 v9, s13, v2
+; GFX10-NEXT: v_mul_lo_u32 v6, s15, v1
+; GFX10-NEXT: v_mul_hi_u32 v10, s12, v2
+; GFX10-NEXT: v_mul_hi_u32 v11, s13, v2
+; GFX10-NEXT: v_mul_lo_u32 v2, s14, v0
+; GFX10-NEXT: v_mul_hi_u32 v7, s14, v1
+; GFX10-NEXT: v_mul_hi_u32 v1, s15, v1
+; GFX10-NEXT: v_mul_lo_u32 v12, s15, v0
+; GFX10-NEXT: v_mul_hi_u32 v13, s14, v0
+; GFX10-NEXT: v_mul_hi_u32 v14, s15, v0
; GFX10-NEXT: v_add_co_u32 v0, s0, v3, v8
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v4, s0, v9, v4
@@ -1678,77 +1677,77 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_add_co_u32 v8, s0, v4, v0
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
; GFX10-NEXT: v_add_co_u32 v10, s0, v1, v2
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s12, v8, 0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, s16, v8, 0
; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s14, v10, 0
+; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, s18, v10, 0
; GFX10-NEXT: v_add_nc_u32_e32 v7, v9, v7
; GFX10-NEXT: v_add3_u32 v9, v5, v4, v11
; GFX10-NEXT: v_add_co_u32 v12, vcc_lo, v8, 1
; GFX10-NEXT: v_mov_b32_e32 v11, 0
; GFX10-NEXT: v_add3_u32 v7, v7, v6, v14
-; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s12, v9, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, s16, v9, v[1:2]
; GFX10-NEXT: v_add_co_ci_u32_e32 v13, vcc_lo, 0, v9, vcc_lo
-; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s14, v7, v[3:4]
-; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s13, v8, v[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, s18, v7, v[3:4]
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, s17, v8, v[4:5]
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v12, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v13, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s8, v0
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s15, v10, v[5:6]
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s9, v3, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s0, s12, v14
-; GFX10-NEXT: v_sub_nc_u32_e32 v1, s9, v3
+; GFX10-NEXT: v_sub_co_u32 v14, vcc_lo, s12, v0
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, s19, v10, v[5:6]
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s0, s13, v3, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s0, s16, v14
+; GFX10-NEXT: v_sub_nc_u32_e32 v1, s13, v3
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, -1, s0
-; GFX10-NEXT: v_sub_co_u32 v15, s0, s10, v2
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
-; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s11, v0, s0
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v15
-; GFX10-NEXT: v_sub_nc_u32_e32 v0, s11, v0
+; GFX10-NEXT: v_sub_co_u32 v15, s0, s14, v2
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
+; GFX10-NEXT: v_sub_co_ci_u32_e64 v16, s1, s15, v0, s0
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v15
+; GFX10-NEXT: v_sub_nc_u32_e32 v0, s15, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s12
+; GFX10-NEXT: v_sub_co_u32 v17, vcc_lo, v14, s16
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v5
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s15, v0, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v18
-; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s13, v1, vcc_lo
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v5
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v23, s0, s19, v0, s0
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v18
+; GFX10-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s17, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v19, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s12, v17
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s16, v17
; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s13, v18
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s17, v18
; GFX10-NEXT: v_cndmask_b32_e64 v21, 0, -1, s1
-; GFX10-NEXT: v_cmp_le_u32_e64 s1, s15, v16
+; GFX10-NEXT: v_cmp_le_u32_e64 s1, s19, v16
; GFX10-NEXT: v_cndmask_b32_e64 v0, v21, v20, s0
-; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s13, v5
+; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s17, v5
; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, -1, s1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0
-; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s12
+; GFX10-NEXT: v_sub_co_u32 v0, s0, v17, s16
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v19, s0, 0, v1, s0
; GFX10-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v13, v6, vcc_lo
-; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s14
+; GFX10-NEXT: v_sub_co_u32 v6, s1, v15, s18
; GFX10-NEXT: v_cndmask_b32_e32 v4, v17, v0, vcc_lo
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v12, s2, 0, v23, s1
; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v3, s0
; GFX10-NEXT: v_cndmask_b32_e32 v3, v18, v19, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v16
; GFX10-NEXT: v_cndmask_b32_e64 v4, v14, v4, s0
; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v3, s0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v22, v2, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s19, v12
; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s14, v6
+; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s18, v6
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc_lo
; GFX10-NEXT: v_add_co_u32 v13, vcc_lo, v10, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v14, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s15, v12
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, s19, v12
; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32 v9, vcc_lo, v13, 1
; GFX10-NEXT: v_add_co_ci_u32_e32 v17, vcc_lo, 0, v14, vcc_lo
-; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s15, v23, s1
+; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, vcc_lo, s19, v23, s1
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8
-; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s14
+; GFX10-NEXT: v_sub_co_u32 v8, s1, v6, s18
; GFX10-NEXT: v_subrev_co_ci_u32_e64 v18, s1, 0, v18, s1
; GFX10-NEXT: v_cndmask_b32_e32 v9, v13, v9, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v13, v14, v17, vcc_lo
@@ -1759,8 +1758,8 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v13, s1
; GFX10-NEXT: v_cndmask_b32_e64 v6, v15, v6, s1
; GFX10-NEXT: v_cndmask_b32_e64 v7, v16, v8, s1
-; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[4:5]
-; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[6:7]
+; GFX10-NEXT: global_store_dwordx4 v11, v[0:3], s[8:9]
+; GFX10-NEXT: global_store_dwordx4 v11, v[4:7], s[10:11]
; GFX10-NEXT: s_endpgm
%div = udiv <2 x i64> %x, %y
store <2 x i64> %div, ptr addrspace(1) %out0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
index 037210a496d6d..a2439e53ae665 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll
@@ -6,36 +6,36 @@
define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i8_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i8_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_byte v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_byte v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i8, ptr addrspace(4) %in, align 4
store i8 %ld, ptr addrspace(1) %out, align 4
@@ -45,36 +45,36 @@ define amdgpu_kernel void @constant_load_i8_align4(ptr addrspace (1) %out, ptr a
define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr addrspace(4) %in) #0 {
; GFX8-LABEL: constant_load_i16_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i16_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_short v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_short v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%ld = load i16, ptr addrspace(4) %in, align 4
store i16 %ld, ptr addrspace(1) %out, align 4
@@ -84,39 +84,39 @@ define amdgpu_kernel void @constant_load_i16_align4(ptr addrspace (1) %out, ptr
define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: sextload_i8_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i8 s2, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i8 s2, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_sext_i32_i8 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i8 s2, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_sext_i32_i8 s0, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%sext = sext i8 %load to i32
@@ -127,39 +127,39 @@ define amdgpu_kernel void @sextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: sextload_i16_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sext_i32_i16 s0, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_sext_i32_i16 s0, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_sext_i32_i16 s0, s0
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%sext = sext i16 %load to i32
@@ -170,39 +170,39 @@ define amdgpu_kernel void @sextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: zextload_i8_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s2, s2, 0xff
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: zextload_i8_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s0, 0xff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i8_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s2, s2, 0xff
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_and_b32 s0, s0, 0xff
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 4
%zext = zext i8 %load to i32
@@ -213,39 +213,39 @@ define amdgpu_kernel void @zextload_i8_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: zextload_i16_to_i32_align4:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: zextload_i16_to_i32_align4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: zextload_i16_to_i32_align4:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 4
%zext = zext i16 %load to i32
@@ -256,35 +256,35 @@ define amdgpu_kernel void @zextload_i16_to_i32_align4(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_load_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX9-NEXT: global_store_byte v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-NEXT: global_store_byte v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
store i8 %load, ptr addrspace(1) %out, align 2
@@ -294,35 +294,35 @@ define amdgpu_kernel void @constant_load_i8_align2(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_load_i16_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_load_i16_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_load_i16_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%load = load i16, ptr addrspace(1) %in, align 2
store i16 %load, ptr addrspace(1) %out, align 2
@@ -332,43 +332,43 @@ define amdgpu_kernel void @constant_load_i16_align2(ptr addrspace(1) %out, ptr a
define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_sextload_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_sextload_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
+; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_sextload_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_sbyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_sbyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
+; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%sextload = sext i8 %load to i32
@@ -379,43 +379,43 @@ define amdgpu_kernel void @constant_sextload_i8_align2(ptr addrspace(1) %out, pt
define amdgpu_kernel void @constant_zextload_i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX8-LABEL: constant_zextload_i8_align2:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: constant_zextload_i8_align2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
-; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
+; GFX9-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: constant_zextload_i8_align2:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ubyte v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[0:1] offset:2
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
+; GFX10-NEXT: global_store_short_d16_hi v0, v1, s[4:5] offset:2
; GFX10-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in, align 2
%zextload = zext i8 %load to i32
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 422e2747094ce..cdf03ae2c70dc 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -22,65 +22,65 @@ define amdgpu_kernel void @s_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX8-LABEL: s_add_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s0, s2, s3
+; GFX8-NEXT: s_add_i32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s2, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_add_i32 s0, s0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s2, s3
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: s_add_i32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_add_i32 s0, s0, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_add_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s2, s3
+; GFX12-NEXT: s_add_co_i32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -110,75 +110,75 @@ define amdgpu_kernel void @s_add_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: s_add_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_i32 s0, s5, s7
-; GFX8-NEXT: s_add_i32 s1, s4, s6
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_i32 s1, s1, s3
+; GFX8-NEXT: s_add_i32 s0, s0, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s5, s7
-; GFX9-NEXT: s_add_i32 s3, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_add_i32 s1, s1, s3
+; GFX9-NEXT: s_add_i32 s0, s0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v2i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s4, s6
-; GFX10-NEXT: s_add_i32 s3, s5, s7
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: s_add_i32 s0, s0, s2
+; GFX10-NEXT: s_add_i32 s1, s1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s4, s6
-; GFX11-NEXT: s_add_i32 s3, s5, s7
+; GFX11-NEXT: s_add_i32 s0, s0, s2
+; GFX11-NEXT: s_add_i32 s1, s1, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: s_add_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s4, s6
-; GFX12-NEXT: s_add_co_i32 s3, s5, s7
+; GFX12-NEXT: s_add_co_i32 s0, s0, s2
+; GFX12-NEXT: s_add_co_i32 s1, s1, s3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -342,42 +342,42 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX8-LABEL: s_add_v8i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s7, s7, s15
; GFX8-NEXT: s_add_i32 s6, s6, s14
; GFX8-NEXT: s_add_i32 s5, s5, s13
; GFX8-NEXT: s_add_i32 s4, s4, s12
-; GFX8-NEXT: s_add_i32 s2, s11, s19
-; GFX8-NEXT: s_add_i32 s3, s10, s18
+; GFX8-NEXT: s_add_i32 s0, s11, s19
+; GFX8-NEXT: s_add_i32 s1, s10, s18
; GFX8-NEXT: s_add_i32 s9, s9, s17
; GFX8-NEXT: s_add_i32 s8, s8, s16
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: s_add_v8i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s7, s15
-; GFX9-NEXT: s_add_i32 s3, s6, s14
+; GFX9-NEXT: s_add_i32 s0, s7, s15
+; GFX9-NEXT: s_add_i32 s1, s6, s14
; GFX9-NEXT: s_add_i32 s6, s11, s19
; GFX9-NEXT: s_add_i32 s7, s10, s18
; GFX9-NEXT: s_add_i32 s9, s9, s17
@@ -388,23 +388,24 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v8i32:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v8, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s7, s15
-; GFX10-NEXT: s_add_i32 s3, s6, s14
+; GFX10-NEXT: s_add_i32 s0, s7, s15
+; GFX10-NEXT: s_add_i32 s1, s6, s14
; GFX10-NEXT: s_add_i32 s6, s11, s19
; GFX10-NEXT: s_add_i32 s7, s10, s18
; GFX10-NEXT: s_add_i32 s8, s8, s16
@@ -417,20 +418,20 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX10-NEXT: v_mov_b32_e32 v3, s6
; GFX10-NEXT: v_mov_b32_e32 v4, s4
; GFX10-NEXT: v_mov_b32_e32 v5, s5
-; GFX10-NEXT: v_mov_b32_e32 v6, s3
-; GFX10-NEXT: v_mov_b32_e32 v7, s2
-; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v6, s1
+; GFX10-NEXT: v_mov_b32_e32 v7, s0
+; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3] offset:16
+; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v8i32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s7, s15
-; GFX11-NEXT: s_add_i32 s3, s6, s14
+; GFX11-NEXT: s_add_i32 s0, s7, s15
+; GFX11-NEXT: s_add_i32 s1, s6, s14
; GFX11-NEXT: s_add_i32 s6, s11, s19
; GFX11-NEXT: s_add_i32 s7, s10, s18
; GFX11-NEXT: s_add_i32 s8, s8, s16
@@ -440,11 +441,11 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX11-NEXT: v_mov_b32_e32 v6, s3
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX11-NEXT: v_mov_b32_e32 v6, s1
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -453,10 +454,10 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s7, s15
-; GFX12-NEXT: s_add_co_i32 s3, s6, s14
+; GFX12-NEXT: s_add_co_i32 s0, s7, s15
+; GFX12-NEXT: s_add_co_i32 s1, s6, s14
; GFX12-NEXT: s_add_co_i32 s6, s11, s19
; GFX12-NEXT: s_add_co_i32 s7, s10, s18
; GFX12-NEXT: s_add_co_i32 s8, s8, s16
@@ -466,11 +467,11 @@ define amdgpu_kernel void @s_add_v8i32(ptr addrspace(1) %out, <8 x i32> %a, <8 x
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s6
; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s0
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -534,7 +535,7 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX8-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s7, s7, s39
; GFX8-NEXT: s_add_i32 s6, s6, s38
@@ -548,43 +549,43 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX8-NEXT: s_add_i32 s14, s14, s46
; GFX8-NEXT: s_add_i32 s13, s13, s45
; GFX8-NEXT: s_add_i32 s12, s12, s44
-; GFX8-NEXT: s_add_i32 s2, s19, s51
-; GFX8-NEXT: s_add_i32 s3, s18, s50
+; GFX8-NEXT: s_add_i32 s0, s19, s51
+; GFX8-NEXT: s_add_i32 s1, s18, s50
; GFX8-NEXT: s_add_i32 s17, s17, s49
; GFX8-NEXT: s_add_i32 s16, s16, s48
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 48
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 32
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v1, s17
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NEXT: v_mov_b32_e32 v1, s13
; GFX8-NEXT: v_mov_b32_e32 v2, s14
; GFX8-NEXT: v_mov_b32_e32 v3, s15
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s3, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NEXT: v_mov_b32_e32 v3, s11
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -592,11 +593,11 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX9-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s7, s39
-; GFX9-NEXT: s_add_i32 s3, s6, s38
+; GFX9-NEXT: s_add_i32 s0, s7, s39
+; GFX9-NEXT: s_add_i32 s1, s6, s38
; GFX9-NEXT: s_add_i32 s6, s11, s43
; GFX9-NEXT: s_add_i32 s7, s10, s42
; GFX9-NEXT: s_add_i32 s10, s15, s47
@@ -613,38 +614,38 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX9-NEXT: v_mov_b32_e32 v3, s14
; GFX9-NEXT: s_add_i32 s9, s9, s41
; GFX9-NEXT: s_add_i32 s8, s8, s40
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:48
; GFX9-NEXT: s_add_i32 s5, s5, s37
; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_mov_b32_e32 v1, s13
; GFX9-NEXT: v_mov_b32_e32 v2, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: s_add_i32 s4, s4, s36
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:32
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: v_mov_b32_e32 v2, s7
; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_add_v16i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
; GFX10-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v16, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_i32 s2, s7, s39
-; GFX10-NEXT: s_add_i32 s3, s6, s38
+; GFX10-NEXT: s_add_i32 s0, s7, s39
+; GFX10-NEXT: s_add_i32 s1, s6, s38
; GFX10-NEXT: s_add_i32 s6, s11, s43
; GFX10-NEXT: s_add_i32 s7, s10, s42
; GFX10-NEXT: s_add_i32 s10, s15, s47
@@ -673,12 +674,12 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX10-NEXT: v_mov_b32_e32 v11, s6
; GFX10-NEXT: v_mov_b32_e32 v12, s4
; GFX10-NEXT: v_mov_b32_e32 v13, s5
-; GFX10-NEXT: v_mov_b32_e32 v14, s3
-; GFX10-NEXT: v_mov_b32_e32 v15, s2
-; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] offset:48
-; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:32
-; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:16
-; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v14, s1
+; GFX10-NEXT: v_mov_b32_e32 v15, s0
+; GFX10-NEXT: global_store_dwordx4 v16, v[0:3], s[2:3] offset:48
+; GFX10-NEXT: global_store_dwordx4 v16, v[4:7], s[2:3] offset:32
+; GFX10-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3] offset:16
+; GFX10-NEXT: global_store_dwordx4 v16, v[12:15], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_add_v16i32:
@@ -686,10 +687,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
; GFX11-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s7, s39
-; GFX11-NEXT: s_add_i32 s3, s6, s38
+; GFX11-NEXT: s_add_i32 s0, s7, s39
+; GFX11-NEXT: s_add_i32 s1, s6, s38
; GFX11-NEXT: s_add_i32 s6, s11, s43
; GFX11-NEXT: s_add_i32 s7, s10, s42
; GFX11-NEXT: s_add_i32 s10, s15, s47
@@ -711,13 +712,13 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9
; GFX11-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6
; GFX11-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX11-NEXT: v_mov_b32_e32 v14, s3
+; GFX11-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s0
+; GFX11-NEXT: v_mov_b32_e32 v14, s1
; GFX11-NEXT: s_clause 0x3
-; GFX11-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX11-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX11-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX11-NEXT: global_store_b128 v16, v[0:3], s[2:3] offset:48
+; GFX11-NEXT: global_store_b128 v16, v[4:7], s[2:3] offset:32
+; GFX11-NEXT: global_store_b128 v16, v[8:11], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v16, v[12:15], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -727,10 +728,10 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
; GFX12-NEXT: s_load_b512 s[36:51], s[0:1], 0xa4
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_co_i32 s2, s7, s39
-; GFX12-NEXT: s_add_co_i32 s3, s6, s38
+; GFX12-NEXT: s_add_co_i32 s0, s7, s39
+; GFX12-NEXT: s_add_co_i32 s1, s6, s38
; GFX12-NEXT: s_add_co_i32 s6, s11, s43
; GFX12-NEXT: s_add_co_i32 s7, s10, s42
; GFX12-NEXT: s_add_co_i32 s10, s15, s47
@@ -752,13 +753,13 @@ define amdgpu_kernel void @s_add_v16i32(ptr addrspace(1) %out, <16 x i32> %a, <1
; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s9
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s6
; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[2:3] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[2:3] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[2:3] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -792,11 +793,11 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; GFX8-LABEL: v_add_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -804,68 +805,68 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_dword v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_add_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_add_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:4 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v0, v0, s[6:7] offset:4 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -901,66 +902,66 @@ define amdgpu_kernel void @v_add_imm_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: v_add_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_add_imm_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v0, 0x7b, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_add_imm_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_add_imm_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_add_imm_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_b32 v0, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_nc_u32_e32 v0, 0x7b, v0
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -993,11 +994,11 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX8-LABEL: add64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_add_u32 s0, s6, s0
-; GFX8-NEXT: s_addc_u32 s1, s7, s1
+; GFX8-NEXT: s_add_u32 s0, s6, s2
+; GFX8-NEXT: s_addc_u32 s1, s7, s3
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -1035,10 +1036,10 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1050,9 +1051,9 @@ define amdgpu_kernel void @add64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1090,15 +1091,15 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
;
; GFX8-LABEL: add64_sgpr_vgpr:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_add_u32 s0, s2, s4
-; GFX8-NEXT: s_addc_u32 s1, s3, s5
+; GFX8-NEXT: s_add_u32 s0, s6, s0
+; GFX8-NEXT: s_addc_u32 s1, s7, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -1138,16 +1139,16 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX11-LABEL: add64_sgpr_vgpr:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s2, s4
-; GFX11-NEXT: s_addc_u32 s3, s3, s5
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s6, s0
+; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1155,15 +1156,15 @@ define amdgpu_kernel void @add64_sgpr_vgpr(ptr addrspace(1) %out, i64 %a, ptr ad
; GFX12-LABEL: add64_sgpr_vgpr:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1205,123 +1206,123 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: add64_in_branch:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX8-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX8-NEXT: s_cbranch_scc0 .LBB9_4
; GFX8-NEXT: ; %bb.1: ; %else
-; GFX8-NEXT: s_add_u32 s4, s4, s6
-; GFX8-NEXT: s_addc_u32 s5, s5, s7
-; GFX8-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX8-NEXT: s_add_u32 s0, s8, s10
+; GFX8-NEXT: s_addc_u32 s1, s9, s11
+; GFX8-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX8-NEXT: s_cbranch_vccnz .LBB9_3
; GFX8-NEXT: .LBB9_2: ; %if
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: .LBB9_3: ; %endif
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
; GFX8-NEXT: .LBB9_4:
-; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX8-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX8-NEXT: s_branch .LBB9_2
;
; GFX9-LABEL: add64_in_branch:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_add_u32 s4, s4, s6
-; GFX9-NEXT: s_addc_u32 s5, s5, s7
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB9_3
; GFX9-NEXT: .LBB9_2: ; %if
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX9-NEXT: .LBB9_3: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB9_4:
-; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX9-NEXT: s_branch .LBB9_2
;
; GFX10-LABEL: add64_in_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB9_4
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_add_u32 s4, s4, s6
-; GFX10-NEXT: s_addc_u32 s5, s5, s7
+; GFX10-NEXT: s_add_u32 s0, s8, s10
+; GFX10-NEXT: s_addc_u32 s1, s9, s11
; GFX10-NEXT: s_cbranch_execnz .LBB9_3
; GFX10-NEXT: .LBB9_2: ; %if
-; GFX10-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX10-NEXT: .LBB9_3: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB9_4:
-; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB9_2
;
; GFX11-LABEL: add64_in_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX11-NEXT: s_cbranch_scc0 .LBB9_4
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_add_u32 s4, s4, s6
-; GFX11-NEXT: s_addc_u32 s5, s5, s7
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
; GFX11-NEXT: s_cbranch_execnz .LBB9_3
; GFX11-NEXT: .LBB9_2: ; %if
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX11-NEXT: .LBB9_3: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB9_4:
-; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX11-NEXT: s_branch .LBB9_2
;
; GFX12-LABEL: add64_in_branch:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX12-NEXT: s_cbranch_scc0 .LBB9_4
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[8:9], s[10:11]
; GFX12-NEXT: s_cbranch_execnz .LBB9_3
; GFX12-NEXT: .LBB9_2: ; %if
-; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: .LBB9_3: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
; GFX12-NEXT: .LBB9_4:
-; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX12-NEXT: s_branch .LBB9_2
entry:
%0 = icmp eq i64 %a, 0
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
index be9b5b00c39d7..65b8db96dfbd8 100644
--- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -10,14 +10,14 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: v_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -66,13 +66,13 @@ define amdgpu_kernel void @v_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
@@ -94,19 +94,19 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: s_test_add_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: s_lshr_b32 s3, s0, 16
-; VI-NEXT: s_add_i32 s2, s2, s0
-; VI-NEXT: s_add_i32 s1, s1, s3
-; VI-NEXT: s_and_b32 s0, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_lshr_b32 s2, s0, 16
+; VI-NEXT: s_lshr_b32 s3, s1, 16
+; VI-NEXT: s_add_i32 s0, s0, s1
+; VI-NEXT: s_add_i32 s2, s2, s3
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_lshl_b32 s1, s2, 16
; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -144,13 +144,13 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v1, s2, s0
+; GFX11-NEXT: v_pk_add_u16 v1, s0, s1
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -165,54 +165,54 @@ define amdgpu_kernel void @s_test_add_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @s_test_add_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; VI-LABEL: s_test_add_self_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: s_and_b32 s1, s2, 0xffff
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_add_i32 s0, s0, s0
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_and_b32 s0, s0, 0xffff
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_add_self_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_add_self_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_add_self_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_u16 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_add_u16 v1, s0, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -279,17 +279,17 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(ptr addrspace(1) %out, <2 x
define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x1c8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, 0x7b, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -298,38 +298,38 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_add_v2i16_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0x1c8007b
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0x1c8007b, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -346,17 +346,17 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0xfffffc21
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, 0xfcb3, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -365,38 +365,38 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_add_v2i16_neg_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s2, 0xfc21fcb3
-; GFX9-NEXT: v_pk_add_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_add_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_neg_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_neg_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, 0xfc21fcb3, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -412,17 +412,17 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(ptr addrspace(1) %out,
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_e32 v4, -1, v2
; VI-NEXT: v_add_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
@@ -431,37 +431,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_add_v2i16_inline_neg1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_neg1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_neg1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, -1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -477,16 +477,16 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(ptr addrspace(1) %out, p
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; VI-NEXT: v_add_u16_e32 v2, 32, v2
; VI-NEXT: v_or_b32_e32 v2, v2, v3
@@ -495,37 +495,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 32
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -542,17 +542,17 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; VI-LABEL: v_test_add_v2i16_inline_fp_split:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, 0x3f80
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u16_sdwa v3, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -560,37 +560,37 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_add_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -608,14 +608,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -667,13 +667,13 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -700,14 +700,14 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v6, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -763,12 +763,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
@@ -796,14 +796,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -857,13 +857,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -890,14 +890,14 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_add_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@@ -957,13 +957,13 @@ define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
diff --git a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
index 330cf48803680..46379dada77f2 100644
--- a/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
+++ b/llvm/test/CodeGen/AMDGPU/amd.endpgm.ll
@@ -46,11 +46,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX9-NEXT: s_cmp_lt_i32 s2, 1
; GFX9-NEXT: s_cbranch_scc0 .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: flat_store_dword v[0:1], v2
; GFX9-NEXT: s_endpgm
; GFX9-NEXT: .LBB2_2: ; %then
@@ -63,11 +63,11 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX10-NEXT: s_cmp_lt_i32 s2, 1
; GFX10-NEXT: s_cbranch_scc0 .LBB2_2
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: v_mov_b32_e32 v0, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX10-NEXT: .LBB2_2: ; %then
@@ -80,10 +80,10 @@ define amdgpu_kernel void @test2(ptr %p, i32 %x) {
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cbranch_scc0 .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX11-NEXT: .LBB2_2: ; %then
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
index 77976e470fc78..95f59479c73e8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-load-offset-from-reg.ll
@@ -72,12 +72,12 @@ define amdgpu_ps void @test_sgpr_plus_imm_offset(ptr addrspace(4) inreg %base, i
; SDAG-DAG: %[[BASE1:.*]]:sgpr_32 = COPY $sgpr1
; SDAG-DAG: %[[OFFSET:.*]]:sgpr_32 = COPY $sgpr2
; SDAG-DAG: %[[BASE:.*]]:sgpr_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; SDAG: S_LOAD_DWORDX2_SGPR_IMM killed %[[BASE]], %[[OFFSET]], 16,
+; SDAG: S_LOAD_DWORDX2_SGPR_IMM_ec killed %[[BASE]], %[[OFFSET]], 16,
; GISEL-DAG: %[[BASE0:.*]]:sreg_32 = COPY $sgpr0
; GISEL-DAG: %[[BASE1:.*]]:sreg_32 = COPY $sgpr1
; GISEL-DAG: %[[OFFSET:.*]]:sreg_32 = COPY $sgpr2
; GISEL-DAG: %[[BASE:.*]]:sreg_64 = REG_SEQUENCE %[[BASE0]], %subreg.sub0, %[[BASE1]], %subreg.sub1
-; GISEL: S_LOAD_DWORDX2_SGPR_IMM %[[BASE]], %[[OFFSET]], 16,
+; GISEL: S_LOAD_DWORDX2_SGPR_IMM_ec %[[BASE]], %[[OFFSET]], 16,
define amdgpu_ps void @test_sgpr_plus_imm_offset_x2(ptr addrspace(4) inreg %base, i32 inreg %offset,
ptr addrspace(1) inreg %out) {
%v1 = getelementptr i8, ptr addrspace(4) %base, i64 16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 559871d162e13..e45aceec8acc9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -475,12 +475,12 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
@@ -488,7 +488,8 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX9-NEXT: global_store_short v3, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -544,13 +545,13 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
;
; GFX9-LABEL: urem_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s3, s2, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_lshr_b32 s5, s4, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
+; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
@@ -559,10 +560,10 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -709,29 +710,28 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-LABEL: srem_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_ashr_i32 s5, s4, 16
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s5
-; GFX9-NEXT: s_sext_i32_i16 s2, s4
-; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2
-; GFX9-NEXT: s_xor_b32 s2, s2, s5
+; GFX9-NEXT: s_sext_i32_i16 s0, s4
+; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0
+; GFX9-NEXT: s_xor_b32 s0, s0, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s6, s2, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s6, s0, 1
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s6, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
@@ -781,20 +781,20 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
;
; GFX9-LABEL: udiv_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s4
; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
-; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv i8 %x, %y
store i8 %r, ptr addrspace(1) %out
@@ -849,13 +849,13 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
;
; GFX9-LABEL: urem_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2
-; GFX9-NEXT: s_lshr_b32 s3, s2, 8
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
+; GFX9-NEXT: s_lshr_b32 s0, s4, 8
; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
@@ -863,10 +863,9 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i8 %x, %y
store i8 %r, ptr addrspace(1) %out
@@ -1277,12 +1276,12 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-LABEL: udiv_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT: s_sub_i32 s2, 0, s8
+; GFX9-NEXT: s_sub_i32 s0, 0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -1290,40 +1289,40 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s2, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3
-; GFX9-NEXT: s_mul_i32 s3, s2, s8
-; GFX9-NEXT: s_sub_i32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s13, s2, 1
-; GFX9-NEXT: s_sub_i32 s4, s3, s8
-; GFX9-NEXT: s_cmp_ge_u32 s3, s8
-; GFX9-NEXT: s_cselect_b32 s2, s13, s2
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_add_i32 s4, s2, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s1, s0, s8
+; GFX9-NEXT: s_sub_i32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s13, s0, 1
+; GFX9-NEXT: s_sub_i32 s4, s1, s8
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s0, s13, s0
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s4, s0, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_cselect_b32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s3, 0, s9
-; GFX9-NEXT: s_mul_i32 s3, s3, s12
-; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3
-; GFX9-NEXT: s_add_i32 s12, s12, s3
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s9
+; GFX9-NEXT: s_mul_i32 s1, s1, s12
+; GFX9-NEXT: s_mul_hi_u32 s1, s12, s1
+; GFX9-NEXT: s_add_i32 s12, s12, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12
-; GFX9-NEXT: s_mul_i32 s4, s3, s9
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s12
+; GFX9-NEXT: s_mul_i32 s4, s1, s9
; GFX9-NEXT: s_sub_i32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s8, s3, 1
+; GFX9-NEXT: s_add_i32 s8, s1, 1
; GFX9-NEXT: s_sub_i32 s5, s4, s9
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: s_cmp_ge_u32 s4, s9
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: s_cselect_b32 s3, s8, s3
+; GFX9-NEXT: s_cselect_b32 s1, s8, s1
; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s5, s3, 1
+; GFX9-NEXT: s_add_i32 s5, s1, 1
; GFX9-NEXT: s_cmp_ge_u32 s4, s9
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX9-NEXT: s_sub_i32 s4, 0, s10
@@ -1360,11 +1359,11 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_add_i32 s7, s5, 1
; GFX9-NEXT: s_cmp_ge_u32 s6, s11
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -1585,12 +1584,12 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-LABEL: urem_v4i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT: s_sub_i32 s2, 0, s8
+; GFX9-NEXT: s_sub_i32 s0, 0, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
@@ -1600,35 +1599,35 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s2, s2, s3
-; GFX9-NEXT: s_mul_hi_u32 s2, s3, s2
-; GFX9-NEXT: s_add_i32 s3, s3, s2
-; GFX9-NEXT: s_mul_hi_u32 s2, s4, s3
-; GFX9-NEXT: s_mul_i32 s2, s2, s8
-; GFX9-NEXT: s_sub_i32 s2, s4, s2
-; GFX9-NEXT: s_sub_i32 s3, s2, s8
-; GFX9-NEXT: s_cmp_ge_u32 s2, s8
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_sub_i32 s3, s2, s8
-; GFX9-NEXT: s_cmp_ge_u32 s2, s8
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s0, s0, s8
+; GFX9-NEXT: s_sub_i32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s8
+; GFX9-NEXT: s_cmp_ge_u32 s0, s8
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s8
+; GFX9-NEXT: s_cmp_ge_u32 s0, s8
; GFX9-NEXT: v_readfirstlane_b32 s12, v1
-; GFX9-NEXT: s_cselect_b32 s2, s3, s2
-; GFX9-NEXT: s_sub_i32 s3, 0, s9
-; GFX9-NEXT: s_mul_i32 s3, s3, s12
-; GFX9-NEXT: s_mul_hi_u32 s3, s12, s3
-; GFX9-NEXT: s_add_i32 s12, s12, s3
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s12
-; GFX9-NEXT: s_mul_i32 s3, s3, s9
-; GFX9-NEXT: s_sub_i32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s9
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s9
+; GFX9-NEXT: s_mul_i32 s1, s1, s12
+; GFX9-NEXT: s_mul_hi_u32 s1, s12, s1
+; GFX9-NEXT: s_add_i32 s12, s12, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s12
+; GFX9-NEXT: s_mul_i32 s1, s1, s9
+; GFX9-NEXT: s_sub_i32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s9
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_cmp_ge_u32 s3, s9
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
-; GFX9-NEXT: s_sub_i32 s4, s3, s9
-; GFX9-NEXT: s_cmp_ge_u32 s3, s9
+; GFX9-NEXT: s_cmp_ge_u32 s1, s9
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s9
+; GFX9-NEXT: s_cmp_ge_u32 s1, s9
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s11
-; GFX9-NEXT: s_cselect_b32 s3, s4, s3
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
; GFX9-NEXT: s_sub_i32 s4, 0, s10
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s4, s4, s5
@@ -1660,11 +1659,11 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_sub_i32 s6, s5, s11
; GFX9-NEXT: s_cmp_ge_u32 s5, s11
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -1966,7 +1965,6 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_abs_i32 s2, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -1998,85 +1996,87 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: s_xor_b32 s8, s5, s9
; GFX9-NEXT: s_sub_i32 s9, 0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s12, s2, s3
; GFX9-NEXT: s_abs_i32 s5, s5
; GFX9-NEXT: s_ashr_i32 s8, s8, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s3
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9
-; GFX9-NEXT: s_add_i32 s3, s3, s9
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s9, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, s9
-; GFX9-NEXT: s_add_i32 s12, s3, 1
-; GFX9-NEXT: s_sub_i32 s9, s5, s4
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s12, s3
-; GFX9-NEXT: s_cselect_b32 s5, s9, s5
-; GFX9-NEXT: s_add_i32 s9, s3, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s9, s3
-; GFX9-NEXT: s_abs_i32 s4, s10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, s9
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s3, s2, s4
+; GFX9-NEXT: s_sub_i32 s3, s5, s3
+; GFX9-NEXT: s_add_i32 s9, s2, 1
+; GFX9-NEXT: s_sub_i32 s5, s3, s4
+; GFX9-NEXT: s_cmp_ge_u32 s3, s4
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
+; GFX9-NEXT: s_cselect_b32 s3, s5, s3
+; GFX9-NEXT: s_add_i32 s5, s2, 1
+; GFX9-NEXT: s_cmp_ge_u32 s3, s4
+; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_abs_i32 s3, s10
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s2, s8
+; GFX9-NEXT: s_xor_b32 s4, s6, s10
+; GFX9-NEXT: s_abs_i32 s5, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s5, s6, s10
-; GFX9-NEXT: s_abs_i32 s6, s6
-; GFX9-NEXT: s_ashr_i32 s5, s5, 31
+; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_sub_i32 s8, s2, s8
+; GFX9-NEXT: s_ashr_i32 s4, s4, 31
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9
-; GFX9-NEXT: s_add_i32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8
-; GFX9-NEXT: s_mul_i32 s9, s8, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s9
-; GFX9-NEXT: s_add_i32 s10, s8, 1
-; GFX9-NEXT: s_sub_i32 s9, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s8, s10, s8
-; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_add_i32 s9, s8, 1
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s4, s9, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s2
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s6, s2, s3
+; GFX9-NEXT: s_sub_i32 s5, s5, s6
+; GFX9-NEXT: s_add_i32 s9, s2, 1
+; GFX9-NEXT: s_sub_i32 s6, s5, s3
+; GFX9-NEXT: s_cmp_ge_u32 s5, s3
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
+; GFX9-NEXT: s_cselect_b32 s5, s6, s5
+; GFX9-NEXT: s_add_i32 s6, s2, 1
+; GFX9-NEXT: s_cmp_ge_u32 s5, s3
+; GFX9-NEXT: s_cselect_b32 s5, s6, s2
; GFX9-NEXT: s_abs_i32 s6, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6
-; GFX9-NEXT: s_xor_b32 s4, s4, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_xor_b32 s2, s7, s11
+; GFX9-NEXT: s_xor_b32 s5, s5, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_xor_b32 s0, s7, s11
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; GFX9-NEXT: s_abs_i32 s3, s7
+; GFX9-NEXT: s_abs_i32 s1, s7
; GFX9-NEXT: s_sub_i32 s7, 0, s6
-; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_ashr_i32 s0, s0, 31
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s7, s7, s5
; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
; GFX9-NEXT: s_mul_i32 s7, s5, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s7
+; GFX9-NEXT: s_sub_i32 s1, s1, s7
; GFX9-NEXT: s_add_i32 s8, s5, 1
-; GFX9-NEXT: s_sub_i32 s7, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
+; GFX9-NEXT: s_sub_i32 s7, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
; GFX9-NEXT: s_cselect_b32 s5, s8, s5
-; GFX9-NEXT: s_cselect_b32 s3, s7, s3
+; GFX9-NEXT: s_cselect_b32 s1, s7, s1
; GFX9-NEXT: s_add_i32 s7, s5, 1
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s7, s5
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s7, s5
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = sdiv <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -2350,7 +2350,6 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_abs_i32 s2, s8
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2377,78 +2376,80 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX9-NEXT: s_xor_b32 s2, s2, s3
; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s12, s2, s3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: s_ashr_i32 s8, s5, 31
; GFX9-NEXT: s_abs_i32 s5, s5
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s3
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s9
-; GFX9-NEXT: s_add_i32 s3, s3, s9
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s4
-; GFX9-NEXT: s_cmp_ge_u32 s3, s4
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_abs_i32 s4, s10
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s8
-; GFX9-NEXT: s_sub_i32 s9, 0, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s8
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, s9
+; GFX9-NEXT: s_add_i32 s2, s2, s3
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s2, s2, s4
+; GFX9-NEXT: s_sub_i32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s3, s2, s4
+; GFX9-NEXT: s_cmp_ge_u32 s2, s4
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
+; GFX9-NEXT: s_sub_i32 s3, s2, s4
+; GFX9-NEXT: s_cmp_ge_u32 s2, s4
+; GFX9-NEXT: s_cselect_b32 s2, s3, s2
+; GFX9-NEXT: s_abs_i32 s3, s10
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s2, s8
+; GFX9-NEXT: s_ashr_i32 s4, s6, 31
+; GFX9-NEXT: s_abs_i32 s5, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_ashr_i32 s5, s6, 31
-; GFX9-NEXT: s_abs_i32 s6, s6
+; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_sub_i32 s8, s2, s8
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s9, s9, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s8, s9
-; GFX9-NEXT: s_add_i32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s8, s6, s8
-; GFX9-NEXT: s_mul_i32 s8, s8, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s8
-; GFX9-NEXT: s_sub_i32 s8, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s6, s8, s6
-; GFX9-NEXT: s_sub_i32 s8, s6, s4
-; GFX9-NEXT: s_cmp_ge_u32 s6, s4
-; GFX9-NEXT: s_cselect_b32 s4, s8, s6
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s2
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s6
+; GFX9-NEXT: s_add_i32 s2, s2, s6
+; GFX9-NEXT: s_mul_hi_u32 s2, s5, s2
+; GFX9-NEXT: s_mul_i32 s2, s2, s3
+; GFX9-NEXT: s_sub_i32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s5, s2, s3
+; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_cselect_b32 s2, s5, s2
+; GFX9-NEXT: s_sub_i32 s5, s2, s3
+; GFX9-NEXT: s_cmp_ge_u32 s2, s3
+; GFX9-NEXT: s_cselect_b32 s5, s5, s2
; GFX9-NEXT: s_abs_i32 s6, s11
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
-; GFX9-NEXT: s_xor_b32 s4, s4, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: s_ashr_i32 s2, s7, 31
+; GFX9-NEXT: s_xor_b32 s5, s5, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: s_abs_i32 s3, s7
+; GFX9-NEXT: s_abs_i32 s1, s7
; GFX9-NEXT: s_sub_i32 s7, 0, s6
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_sub_i32 s4, s4, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s12
+; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: s_mul_i32 s7, s7, s5
; GFX9-NEXT: s_mul_hi_u32 s7, s5, s7
; GFX9-NEXT: s_add_i32 s5, s5, s7
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s5
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_sub_i32 s5, s3, s6
-; GFX9-NEXT: s_cmp_ge_u32 s3, s6
-; GFX9-NEXT: s_cselect_b32 s3, s5, s3
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s5, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v3, s2
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem <4 x i32> %x, %y
store <4 x i32> %r, ptr addrspace(1) %out
@@ -2604,7 +2605,6 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
@@ -2617,28 +2617,29 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT: s_and_b32 s2, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s5, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
-; GFX9-NEXT: s_lshr_b32 s2, s7, 16
+; GFX9-NEXT: s_lshr_b32 s0, s7, 16
; GFX9-NEXT: v_mul_f32_e32 v1, v5, v7
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_mad_f32 v3, -v1, v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: s_lshr_b32 s2, s5, 16
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT: s_lshr_b32 s0, s5, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
@@ -2654,7 +2655,8 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -2825,34 +2827,33 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-LABEL: urem_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT: s_and_b32 s9, s6, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
+; GFX9-NEXT: s_and_b32 s8, s4, 0xffff
; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: s_and_b32 s3, s7, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s8, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s1, s5, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
@@ -2867,24 +2868,25 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
; GFX9-NEXT: v_mul_f32_e32 v3, v7, v8
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
; GFX9-NEXT: v_mad_f32 v3, -v3, v5, v7
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v3, v3, s6
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
; GFX9-NEXT: v_sub_u32_e32 v4, s4, v1
-; GFX9-NEXT: v_sub_u32_e32 v1, s8, v2
-; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX9-NEXT: v_sub_u32_e32 v1, s1, v2
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v3
+; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <4 x i16> %x, %y
store <4 x i16> %r, ptr addrspace(1) %out
@@ -3563,27 +3565,27 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
;
; GFX9-LABEL: urem_i3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_bfe_u32 s3, s2, 0x30008
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX9-NEXT: s_bfe_u32 s2, s4, 0x30008
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
-; GFX9-NEXT: s_and_b32 s4, s2, 7
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
-; GFX9-NEXT: s_lshr_b32 s3, s2, 8
+; GFX9-NEXT: s_and_b32 s3, s4, 7
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s3
+; GFX9-NEXT: s_lshr_b32 s2, s4, 8
; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX9-NEXT: v_trunc_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v1
; GFX9-NEXT: v_mad_f32 v1, -v1, v0, v2
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem i3 %x, %y
store i3 %r, ptr addrspace(1) %out
@@ -3753,12 +3755,12 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
; GFX9-NEXT: s_cselect_b32 s2, s6, 0
; GFX9-NEXT: v_add_u32_e32 v0, s2, v2
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s5
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
; GFX9-NEXT: v_and_b32_e32 v0, 7, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX9-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem i3 %x, %y
store i3 %r, ptr addrspace(1) %out
@@ -3881,7 +3883,6 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
@@ -3894,19 +3895,20 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: s_and_b32 s2, s7, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_trunc_f32_e32 v2, v5
-; GFX9-NEXT: s_and_b32 s2, s5, 0xffff
+; GFX9-NEXT: s_and_b32 s0, s5, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3
; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
@@ -3918,8 +3920,9 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: global_store_short v6, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v6, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v6, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v6, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = udiv <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4053,32 +4056,32 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-LABEL: urem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s3, s6, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
+; GFX9-NEXT: s_and_b32 s9, s6, 0xffff
; GFX9-NEXT: s_lshr_b32 s6, s6, 16
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s9
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6
+; GFX9-NEXT: s_and_b32 s8, s4, 0xffff
; GFX9-NEXT: s_lshr_b32 s4, s4, 16
+; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s4
+; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
+; GFX9-NEXT: s_and_b32 s0, s7, 0xffff
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
-; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
-; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
-; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v3, v5
-; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s3
-; GFX9-NEXT: s_and_b32 s3, s7, 0xffff
+; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
+; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v4
+; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
; GFX9-NEXT: v_mad_f32 v2, -v5, v1, v3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3
-; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0
+; GFX9-NEXT: s_and_b32 s1, s5, 0xffff
+; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5
-; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5
+; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
@@ -4087,19 +4090,18 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2
; GFX9-NEXT: v_mad_f32 v2, -v2, v3, v5
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
; GFX9-NEXT: v_mul_lo_u32 v1, v1, s6
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s3
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s0
+; GFX9-NEXT: v_sub_u32_e32 v0, s8, v0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
+; GFX9-NEXT: v_sub_u32_e32 v2, s1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = urem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4465,58 +4467,58 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-LABEL: srem_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s8, s6
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s8
; GFX9-NEXT: s_sext_i32_i16 s9, s4
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9
-; GFX9-NEXT: s_xor_b32 s2, s9, s8
+; GFX9-NEXT: s_xor_b32 s0, s9, s8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
-; GFX9-NEXT: s_or_b32 s10, s2, 1
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
+; GFX9-NEXT: s_or_b32 s10, s0, 1
; GFX9-NEXT: s_sext_i32_i16 s7, s7
; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2
; GFX9-NEXT: v_trunc_f32_e32 v2, v2
; GFX9-NEXT: v_mad_f32 v1, -v2, v0, v1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s10, 0
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, |v0|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s10, 0
; GFX9-NEXT: s_ashr_i32 s6, s6, 16
; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s6
; GFX9-NEXT: s_ashr_i32 s4, s4, 16
; GFX9-NEXT: s_sext_i32_i16 s5, s5
-; GFX9-NEXT: v_add_u32_e32 v1, s2, v2
+; GFX9-NEXT: v_add_u32_e32 v1, s0, v2
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0
-; GFX9-NEXT: s_xor_b32 s2, s4, s6
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
+; GFX9-NEXT: s_xor_b32 s0, s4, s6
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8
; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
; GFX9-NEXT: v_mad_f32 v2, -v3, v0, v2
-; GFX9-NEXT: s_or_b32 s8, s2, 1
+; GFX9-NEXT: s_or_b32 s8, s0, 1
; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s7
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s8, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s8, 0
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v3
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2
-; GFX9-NEXT: s_xor_b32 s2, s5, s7
-; GFX9-NEXT: s_ashr_i32 s2, s2, 30
+; GFX9-NEXT: s_xor_b32 s0, s5, s7
+; GFX9-NEXT: s_ashr_i32 s0, s0, 30
; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v4
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
; GFX9-NEXT: v_mad_f32 v3, -v4, v2, v3
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
-; GFX9-NEXT: s_or_b32 s6, s2, 1
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
-; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GFX9-NEXT: s_cselect_b32 s2, s6, 0
-; GFX9-NEXT: v_add_u32_e32 v2, s2, v4
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_or_b32 s6, s0, 1
+; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v3|, |v2|
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s6, 0
+; GFX9-NEXT: v_add_u32_e32 v2, s0, v4
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s7
; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1
; GFX9-NEXT: v_mov_b32_e32 v3, 0
@@ -4524,9 +4526,8 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v3, v2, s[0:1] offset:4
-; GFX9-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX9-NEXT: global_store_short v3, v2, s[2:3] offset:4
+; GFX9-NEXT: global_store_dword v3, v0, s[2:3]
; GFX9-NEXT: s_endpgm
%r = srem <3 x i16> %x, %y
store <3 x i16> %r, ptr addrspace(1) %out
@@ -4854,28 +4855,28 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
; GFX9-LABEL: urem_v3i15:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_alignbit_b32 v0, s7, v0, 30
-; GFX9-NEXT: s_and_b32 s7, s0, 0x7fff
+; GFX9-NEXT: s_and_b32 s7, s2, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX9-NEXT: s_and_b32 s2, s6, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s2
-; GFX9-NEXT: s_bfe_u32 s2, s0, 0xf000f
+; GFX9-NEXT: s_and_b32 s0, s6, 0x7fff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NEXT: v_alignbit_b32 v3, s1, v3, 30
+; GFX9-NEXT: v_alignbit_b32 v3, s3, v3, 30
+; GFX9-NEXT: s_bfe_u32 s3, s2, 0xf000f
+; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s3
; GFX9-NEXT: v_mul_f32_e32 v5, v4, v5
; GFX9-NEXT: v_trunc_f32_e32 v5, v5
; GFX9-NEXT: v_mad_f32 v4, -v5, v1, v4
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f
+; GFX9-NEXT: s_bfe_u32 s1, s6, 0xf000f
; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff, v3
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v1
-; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1
; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v6
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
; GFX9-NEXT: v_cvt_f32_u32_e32 v5, v3
@@ -4892,11 +4893,11 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v6
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: v_mad_f32 v6, -v6, v5, v8
-; GFX9-NEXT: s_lshr_b32 s1, s0, 15
+; GFX9-NEXT: s_lshr_b32 s0, s2, 15
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, v5
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, s1
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, s0
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v7, vcc
-; GFX9-NEXT: v_mul_lo_u32 v1, v1, s0
+; GFX9-NEXT: v_mul_lo_u32 v1, v1, s2
; GFX9-NEXT: v_mul_lo_u32 v3, v5, v3
; GFX9-NEXT: s_lshr_b32 s0, s6, 15
; GFX9-NEXT: v_sub_u32_e32 v4, s0, v4
@@ -5717,54 +5718,54 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: udiv_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s0, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: s_mul_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7
-; GFX9-NEXT: s_mul_i32 s7, s6, s3
-; GFX9-NEXT: s_sub_i32 s4, s4, s7
-; GFX9-NEXT: s_add_i32 s9, s6, 1
-; GFX9-NEXT: s_sub_i32 s7, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s6, s9, s6
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_add_i32 s7, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s1, s0, s6
+; GFX9-NEXT: s_sub_i32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s9, s0, 1
+; GFX9-NEXT: s_sub_i32 s4, s1, s6
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
+; GFX9-NEXT: s_cselect_b32 s0, s9, s0
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_add_i32 s4, s0, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s6
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s3, s7, s6
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8
-; GFX9-NEXT: s_mul_i32 s6, s4, s2
-; GFX9-NEXT: s_sub_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s7, s4, 1
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s4, s7, s4
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s6, s4, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
-; GFX9-NEXT: s_cselect_b32 s2, s6, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s7
+; GFX9-NEXT: s_mul_i32 s1, s1, s8
+; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT: s_add_i32 s8, s8, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s8
+; GFX9-NEXT: s_mul_i32 s4, s1, s7
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
+; GFX9-NEXT: s_add_i32 s6, s1, 1
+; GFX9-NEXT: s_sub_i32 s5, s4, s7
+; GFX9-NEXT: s_cmp_ge_u32 s4, s7
+; GFX9-NEXT: s_cselect_b32 s1, s6, s1
+; GFX9-NEXT: s_cselect_b32 s4, s5, s4
+; GFX9-NEXT: s_add_i32 s5, s1, 1
+; GFX9-NEXT: s_cmp_ge_u32 s4, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = udiv <2 x i32> %x, %shl.y
@@ -6051,50 +6052,50 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: urem_v2i32_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s7
-; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2
-; GFX9-NEXT: s_sub_i32 s6, 0, s3
+; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s7
+; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
+; GFX9-NEXT: s_sub_i32 s0, 0, s6
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT: v_readfirstlane_b32 s7, v0
-; GFX9-NEXT: s_mul_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s7, s6
-; GFX9-NEXT: s_add_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s6, s4, s7
-; GFX9-NEXT: s_mul_i32 s6, s6, s3
-; GFX9-NEXT: s_sub_i32 s4, s4, s6
-; GFX9-NEXT: s_sub_i32 s6, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
-; GFX9-NEXT: s_cselect_b32 s4, s6, s4
-; GFX9-NEXT: s_sub_i32 s6, s4, s3
-; GFX9-NEXT: s_cmp_ge_u32 s4, s3
+; GFX9-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-NEXT: s_mul_i32 s0, s0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s1, s0
+; GFX9-NEXT: s_add_i32 s1, s1, s0
+; GFX9-NEXT: s_mul_hi_u32 s0, s4, s1
+; GFX9-NEXT: s_mul_i32 s0, s0, s6
+; GFX9-NEXT: s_sub_i32 s0, s4, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s6
+; GFX9-NEXT: s_cmp_ge_u32 s0, s6
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, s0, s6
+; GFX9-NEXT: s_cmp_ge_u32 s0, s6
; GFX9-NEXT: v_readfirstlane_b32 s8, v1
-; GFX9-NEXT: s_cselect_b32 s3, s6, s4
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s8
-; GFX9-NEXT: s_mul_i32 s4, s4, s2
-; GFX9-NEXT: s_sub_i32 s4, s5, s4
-; GFX9-NEXT: s_sub_i32 s5, s4, s2
-; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s4, s5, s4
-; GFX9-NEXT: s_sub_i32 s5, s4, s2
-; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s2, s5, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s0, s1, s0
+; GFX9-NEXT: s_sub_i32 s1, 0, s7
+; GFX9-NEXT: s_mul_i32 s1, s1, s8
+; GFX9-NEXT: s_mul_hi_u32 s1, s8, s1
+; GFX9-NEXT: s_add_i32 s8, s8, s1
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s8
+; GFX9-NEXT: s_mul_i32 s1, s1, s7
+; GFX9-NEXT: s_sub_i32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: s_sub_i32 s4, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s4, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = urem <2 x i32> %x, %shl.y
@@ -6546,65 +6547,66 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6
; GFX9-NEXT: s_abs_i32 s3, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX9-NEXT: s_xor_b32 s2, s4, s2
; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s7
; GFX9-NEXT: s_abs_i32 s7, s4
-; GFX9-NEXT: s_xor_b32 s2, s4, s2
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_sub_i32 s4, 0, s3
-; GFX9-NEXT: s_ashr_i32 s2, s2, 31
+; GFX9-NEXT: s_ashr_i32 s4, s2, 31
+; GFX9-NEXT: s_sub_i32 s2, 0, s3
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s4, s4, s8
-; GFX9-NEXT: s_mul_hi_u32 s4, s8, s4
-; GFX9-NEXT: s_add_i32 s8, s8, s4
-; GFX9-NEXT: s_mul_hi_u32 s4, s7, s8
-; GFX9-NEXT: s_mul_i32 s8, s4, s3
+; GFX9-NEXT: s_mul_i32 s2, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s2, s8, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s2
+; GFX9-NEXT: s_mul_hi_u32 s2, s7, s8
+; GFX9-NEXT: s_mul_i32 s8, s2, s3
; GFX9-NEXT: s_sub_i32 s7, s7, s8
-; GFX9-NEXT: s_add_i32 s9, s4, 1
+; GFX9-NEXT: s_add_i32 s9, s2, 1
; GFX9-NEXT: s_sub_i32 s8, s7, s3
; GFX9-NEXT: s_cmp_ge_u32 s7, s3
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
+; GFX9-NEXT: s_cselect_b32 s2, s9, s2
; GFX9-NEXT: s_cselect_b32 s7, s8, s7
-; GFX9-NEXT: s_add_i32 s8, s4, 1
+; GFX9-NEXT: s_add_i32 s8, s2, 1
; GFX9-NEXT: s_cmp_ge_u32 s7, s3
-; GFX9-NEXT: s_cselect_b32 s3, s8, s4
-; GFX9-NEXT: s_abs_i32 s4, s6
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX9-NEXT: s_xor_b32 s3, s3, s2
-; GFX9-NEXT: s_sub_i32 s7, 0, s4
-; GFX9-NEXT: s_sub_i32 s2, s3, s2
+; GFX9-NEXT: s_cselect_b32 s7, s8, s2
+; GFX9-NEXT: s_abs_i32 s8, s6
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_xor_b32 s0, s5, s6
+; GFX9-NEXT: s_abs_i32 s1, s5
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_xor_b32 s6, s5, s6
-; GFX9-NEXT: s_abs_i32 s5, s5
-; GFX9-NEXT: s_ashr_i32 s6, s6, 31
+; GFX9-NEXT: s_xor_b32 s5, s7, s4
+; GFX9-NEXT: s_sub_i32 s6, 0, s8
+; GFX9-NEXT: s_sub_i32 s4, s5, s4
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s3
-; GFX9-NEXT: s_mul_hi_u32 s7, s3, s7
-; GFX9-NEXT: s_add_i32 s3, s3, s7
-; GFX9-NEXT: s_mul_hi_u32 s3, s5, s3
-; GFX9-NEXT: s_mul_i32 s7, s3, s4
-; GFX9-NEXT: s_sub_i32 s5, s5, s7
-; GFX9-NEXT: s_add_i32 s8, s3, 1
-; GFX9-NEXT: s_sub_i32 s7, s5, s4
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s8, s3
+; GFX9-NEXT: s_ashr_i32 s0, s0, 31
+; GFX9-NEXT: v_readfirstlane_b32 s5, v0
+; GFX9-NEXT: s_mul_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
+; GFX9-NEXT: s_add_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s5
+; GFX9-NEXT: s_mul_i32 s6, s5, s8
+; GFX9-NEXT: s_sub_i32 s1, s1, s6
+; GFX9-NEXT: s_add_i32 s7, s5, 1
+; GFX9-NEXT: s_sub_i32 s6, s1, s8
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
; GFX9-NEXT: s_cselect_b32 s5, s7, s5
-; GFX9-NEXT: s_add_i32 s7, s3, 1
-; GFX9-NEXT: s_cmp_ge_u32 s5, s4
-; GFX9-NEXT: s_cselect_b32 s3, s7, s3
-; GFX9-NEXT: s_xor_b32 s3, s3, s6
-; GFX9-NEXT: s_sub_i32 s3, s3, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_cselect_b32 s1, s6, s1
+; GFX9-NEXT: s_add_i32 s6, s5, 1
+; GFX9-NEXT: s_cmp_ge_u32 s1, s8
+; GFX9-NEXT: s_cselect_b32 s1, s6, s5
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = sdiv <2 x i32> %x, %shl.y
@@ -6989,7 +6991,6 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s2, 0x1000, s6
; GFX9-NEXT: s_abs_i32 s2, s2
@@ -7013,35 +7014,37 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-NEXT: s_cselect_b32 s4, s7, s4
; GFX9-NEXT: s_sub_i32 s7, s4, s2
; GFX9-NEXT: s_cmp_ge_u32 s4, s2
-; GFX9-NEXT: s_cselect_b32 s2, s7, s4
-; GFX9-NEXT: s_abs_i32 s3, s3
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
-; GFX9-NEXT: s_xor_b32 s2, s2, s6
-; GFX9-NEXT: s_sub_i32 s7, 0, s3
-; GFX9-NEXT: s_sub_i32 s2, s2, s6
+; GFX9-NEXT: s_cselect_b32 s4, s7, s4
+; GFX9-NEXT: s_abs_i32 s7, s3
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s7
+; GFX9-NEXT: s_xor_b32 s4, s4, s6
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_ashr_i32 s0, s5, 31
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT: s_ashr_i32 s4, s5, 31
-; GFX9-NEXT: s_abs_i32 s5, s5
+; GFX9-NEXT: s_abs_i32 s1, s5
+; GFX9-NEXT: s_sub_i32 s5, 0, s7
+; GFX9-NEXT: s_sub_i32 s4, s4, s6
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s7, s7, s6
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, s7
-; GFX9-NEXT: s_add_i32 s6, s6, s7
-; GFX9-NEXT: s_mul_hi_u32 s6, s5, s6
-; GFX9-NEXT: s_mul_i32 s6, s6, s3
-; GFX9-NEXT: s_sub_i32 s5, s5, s6
-; GFX9-NEXT: s_sub_i32 s6, s5, s3
-; GFX9-NEXT: s_cmp_ge_u32 s5, s3
-; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s3
-; GFX9-NEXT: s_cmp_ge_u32 s5, s3
-; GFX9-NEXT: s_cselect_b32 s3, s6, s5
-; GFX9-NEXT: s_xor_b32 s3, s3, s4
-; GFX9-NEXT: s_sub_i32 s3, s3, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_mul_i32 s5, s5, s6
+; GFX9-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX9-NEXT: s_add_i32 s6, s6, s5
+; GFX9-NEXT: s_mul_hi_u32 s5, s1, s6
+; GFX9-NEXT: s_mul_i32 s5, s5, s7
+; GFX9-NEXT: s_sub_i32 s1, s1, s5
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_sub_i32 s5, s1, s7
+; GFX9-NEXT: s_cmp_ge_u32 s1, s7
+; GFX9-NEXT: s_cselect_b32 s1, s5, s1
+; GFX9-NEXT: s_xor_b32 s1, s1, s0
+; GFX9-NEXT: s_sub_i32 s0, s1, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = srem <2 x i32> %x, %shl.y
@@ -7281,13 +7284,13 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: udiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 12
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[6:7], 12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = udiv i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -7614,18 +7617,18 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: udiv_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_i32 s2, s8, 12
+; GFX9-NEXT: s_add_i32 s0, s8, 12
; GFX9-NEXT: s_add_i32 s8, s10, 12
-; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], s2
+; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], s0
; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], s8
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
%r = udiv <2 x i64> %x, %shl.y
@@ -7862,12 +7865,12 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: urem_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xfff
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-NEXT: s_and_b32 s0, s6, 0xfff
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: global_store_dwordx2 v1, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = urem i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -8003,22 +8006,22 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX9-LABEL: urem_v2i64_pow2_shl_denom:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], 0x1000, s10
+; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s10
; GFX9-NEXT: s_lshl_b64 s[8:9], 0x1000, s8
; GFX9-NEXT: s_add_u32 s8, s8, -1
; GFX9-NEXT: s_addc_u32 s9, s9, -1
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[8:9]
-; GFX9-NEXT: s_add_u32 s2, s2, -1
-; GFX9-NEXT: s_addc_u32 s3, s3, -1
-; GFX9-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX9-NEXT: s_add_u32 s0, s0, -1
+; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
%shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
%r = urem <2 x i64> %x, %shl.y
@@ -8129,58 +8132,58 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: sdiv_i64_oddk_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT: s_add_u32 s4, 0x396, s4
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT: s_add_u32 s0, 0x396, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT: s_addc_u32 s5, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_addc_u32 s1, 0, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT: s_add_i32 s7, s7, s5
-; GFX9-NEXT: s_sub_i32 s5, s7, s6
-; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT: s_mul_i32 s12, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT: s_add_u32 s6, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT: s_add_i32 s3, s3, s1
+; GFX9-NEXT: s_sub_i32 s1, s3, s2
+; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT: s_mul_i32 s12, s2, s1
+; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT: s_add_u32 s2, s2, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT: s_mul_i32 s10, s0, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s11
-; GFX9-NEXT: s_add_u32 s6, s6, s10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT: s_addc_u32 s6, s8, s9
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s5, s6, s5
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT: s_add_u32 s2, s2, s10
+; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT: s_addc_u32 s2, s8, s9
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_mul_i32 s1, s0, s1
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: s_addc_u32 s2, 0, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s6, s4, s6
+; GFX9-NEXT: s_addc_u32 s8, s0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, s4
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_add_u32 s2, s6, s0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_addc_u32 s3, s7, s0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
; GFX9-NEXT: v_readfirstlane_b32 s9, v0
-; GFX9-NEXT: s_mul_i32 s8, s2, s6
+; GFX9-NEXT: s_mul_i32 s7, s2, s8
; GFX9-NEXT: s_mul_hi_u32 s10, s2, s9
-; GFX9-NEXT: s_mul_hi_u32 s7, s2, s6
-; GFX9-NEXT: s_add_u32 s8, s10, s8
-; GFX9-NEXT: s_addc_u32 s7, 0, s7
+; GFX9-NEXT: s_mul_hi_u32 s6, s2, s8
+; GFX9-NEXT: s_add_u32 s7, s10, s7
+; GFX9-NEXT: s_addc_u32 s6, 0, s6
; GFX9-NEXT: s_mul_hi_u32 s11, s3, s9
; GFX9-NEXT: s_mul_i32 s9, s3, s9
-; GFX9-NEXT: s_add_u32 s8, s8, s9
-; GFX9-NEXT: s_mul_hi_u32 s10, s3, s6
-; GFX9-NEXT: s_addc_u32 s7, s7, s11
-; GFX9-NEXT: s_addc_u32 s8, s10, 0
-; GFX9-NEXT: s_mul_i32 s6, s3, s6
-; GFX9-NEXT: s_add_u32 s6, s7, s6
-; GFX9-NEXT: s_addc_u32 s7, 0, s8
+; GFX9-NEXT: s_add_u32 s7, s7, s9
+; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8
+; GFX9-NEXT: s_addc_u32 s6, s6, s11
+; GFX9-NEXT: s_addc_u32 s7, s10, 0
+; GFX9-NEXT: s_mul_i32 s8, s3, s8
+; GFX9-NEXT: s_add_u32 s6, s6, s8
+; GFX9-NEXT: s_addc_u32 s7, 0, s7
; GFX9-NEXT: s_add_u32 s8, s6, 1
; GFX9-NEXT: s_addc_u32 s9, s7, 0
; GFX9-NEXT: s_add_u32 s10, s6, 2
@@ -8213,13 +8216,13 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cselect_b32 s3, s3, s7
; GFX9-NEXT: s_cselect_b32 s2, s8, s6
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: s_sub_u32 s2, s2, s4
-; GFX9-NEXT: s_subb_u32 s3, s3, s4
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT: s_sub_u32 s2, s2, s0
+; GFX9-NEXT: s_subb_u32 s3, s3, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = sdiv i64 %x, 1235195
store i64 %r, ptr addrspace(1) %out
@@ -8252,17 +8255,17 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: sdiv_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_lshr_b32 s4, s4, 20
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
-; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 12
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_lshr_b32 s0, s0, 20
+; GFX9-NEXT: s_add_u32 s0, s6, s0
+; GFX9-NEXT: s_addc_u32 s1, s7, 0
+; GFX9-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = sdiv i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
@@ -9518,100 +9521,100 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: srem_i64_oddk_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s4, 0x33fe64
-; GFX9-NEXT: s_add_u32 s4, 0x396, s4
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0x33fe64
+; GFX9-NEXT: s_add_u32 s0, 0x396, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0x28100000
-; GFX9-NEXT: s_addc_u32 s5, 0, 0
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: s_addc_u32 s1, 0, 0
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s4, s5, 0xd95
-; GFX9-NEXT: v_readfirstlane_b32 s6, v0
-; GFX9-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s7, s6, 0xffed2705
-; GFX9-NEXT: s_add_i32 s7, s7, s5
-; GFX9-NEXT: s_sub_i32 s5, s7, s6
-; GFX9-NEXT: s_mul_i32 s8, s6, 0xffed2705
-; GFX9-NEXT: s_mul_hi_u32 s11, s6, s5
-; GFX9-NEXT: s_mul_i32 s12, s6, s5
-; GFX9-NEXT: s_mul_hi_u32 s6, s6, s8
-; GFX9-NEXT: s_add_u32 s6, s6, s12
-; GFX9-NEXT: s_mul_hi_u32 s9, s4, s8
-; GFX9-NEXT: s_mul_i32 s10, s4, s8
+; GFX9-NEXT: s_addc_u32 s0, s1, 0xd95
+; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: s_mul_i32 s1, s0, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s3, s2, 0xffed2705
+; GFX9-NEXT: s_add_i32 s3, s3, s1
+; GFX9-NEXT: s_sub_i32 s1, s3, s2
+; GFX9-NEXT: s_mul_i32 s8, s2, 0xffed2705
+; GFX9-NEXT: s_mul_hi_u32 s11, s2, s1
+; GFX9-NEXT: s_mul_i32 s12, s2, s1
+; GFX9-NEXT: s_mul_hi_u32 s2, s2, s8
+; GFX9-NEXT: s_add_u32 s2, s2, s12
+; GFX9-NEXT: s_mul_hi_u32 s9, s0, s8
+; GFX9-NEXT: s_mul_i32 s10, s0, s8
; GFX9-NEXT: s_addc_u32 s8, 0, s11
-; GFX9-NEXT: s_add_u32 s6, s6, s10
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mul_hi_u32 s7, s4, s5
-; GFX9-NEXT: s_addc_u32 s6, s8, s9
-; GFX9-NEXT: s_addc_u32 s7, s7, 0
-; GFX9-NEXT: s_mul_i32 s5, s4, s5
-; GFX9-NEXT: s_add_u32 s5, s6, s5
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX9-NEXT: s_add_u32 s2, s2, s10
+; GFX9-NEXT: s_mul_hi_u32 s3, s0, s1
+; GFX9-NEXT: s_addc_u32 s2, s8, s9
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_mul_i32 s1, s0, s1
+; GFX9-NEXT: s_add_u32 s1, s2, s1
+; GFX9-NEXT: s_addc_u32 s2, 0, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s1, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_addc_u32 s6, s4, s6
+; GFX9-NEXT: s_addc_u32 s8, s0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_add_u32 s2, s2, s4
-; GFX9-NEXT: s_mov_b32 s5, s4
-; GFX9-NEXT: s_addc_u32 s3, s3, s4
-; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s8, v0
-; GFX9-NEXT: s_mul_i32 s7, s2, s6
-; GFX9-NEXT: s_mul_hi_u32 s9, s2, s8
-; GFX9-NEXT: s_mul_hi_u32 s5, s2, s6
-; GFX9-NEXT: s_add_u32 s7, s9, s7
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s10, s3, s8
-; GFX9-NEXT: s_mul_i32 s8, s3, s8
-; GFX9-NEXT: s_add_u32 s7, s7, s8
-; GFX9-NEXT: s_mul_hi_u32 s9, s3, s6
-; GFX9-NEXT: s_addc_u32 s5, s5, s10
-; GFX9-NEXT: s_addc_u32 s7, s9, 0
-; GFX9-NEXT: s_mul_i32 s6, s3, s6
-; GFX9-NEXT: s_add_u32 s5, s5, s6
-; GFX9-NEXT: s_addc_u32 s6, 0, s7
-; GFX9-NEXT: s_mul_hi_u32 s8, s5, 0x12d8fb
-; GFX9-NEXT: s_mul_i32 s5, s5, 0x12d8fb
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_add_u32 s2, s6, s0
+; GFX9-NEXT: s_mov_b32 s1, s0
+; GFX9-NEXT: s_addc_u32 s3, s7, s0
+; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s7, v0
+; GFX9-NEXT: s_mul_i32 s6, s2, s8
+; GFX9-NEXT: s_mul_hi_u32 s9, s2, s7
+; GFX9-NEXT: s_mul_hi_u32 s1, s2, s8
+; GFX9-NEXT: s_add_u32 s6, s9, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s10, s3, s7
+; GFX9-NEXT: s_mul_i32 s7, s3, s7
+; GFX9-NEXT: s_add_u32 s6, s6, s7
+; GFX9-NEXT: s_mul_hi_u32 s9, s3, s8
+; GFX9-NEXT: s_addc_u32 s1, s1, s10
+; GFX9-NEXT: s_addc_u32 s6, s9, 0
+; GFX9-NEXT: s_mul_i32 s7, s3, s8
+; GFX9-NEXT: s_add_u32 s1, s1, s7
+; GFX9-NEXT: s_addc_u32 s6, 0, s6
+; GFX9-NEXT: s_mul_hi_u32 s8, s1, 0x12d8fb
+; GFX9-NEXT: s_mul_i32 s1, s1, 0x12d8fb
; GFX9-NEXT: s_mul_i32 s6, s6, 0x12d8fb
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
; GFX9-NEXT: s_mov_b32 s7, 0x12d8fb
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s2, s3, s8
+; GFX9-NEXT: s_subb_u32 s1, s3, s8
; GFX9-NEXT: v_subrev_co_u32_e32 v1, vcc, s7, v0
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s3, s2, 0
+; GFX9-NEXT: s_subb_u32 s2, s1, 0
; GFX9-NEXT: v_subrev_co_u32_e32 v3, vcc, s7, v1
; GFX9-NEXT: s_cmp_lg_u64 vcc, 0
-; GFX9-NEXT: s_subb_u32 s5, s3, 0
+; GFX9-NEXT: s_subb_u32 s3, s2, 0
; GFX9-NEXT: s_mov_b32 s6, 0x12d8fa
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v1
-; GFX9-NEXT: s_cmp_eq_u32 s3, 0
+; GFX9-NEXT: s_cmp_eq_u32 s2, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX9-NEXT: v_mov_b32_e32 v5, s3
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
-; GFX9-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-NEXT: s_cmp_eq_u32 s1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
; GFX9-NEXT: s_cselect_b64 vcc, -1, 0
; GFX9-NEXT: v_cndmask_b32_e32 v3, -1, v3, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
-; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
-; GFX9-NEXT: v_xor_b32_e32 v1, s4, v3
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT: v_xor_b32_e32 v1, s0, v3
+; GFX9-NEXT: v_mov_b32_e32 v3, s0
+; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = srem i64 %x, 1235195
store i64 %r, ptr addrspace(1) %out
@@ -9646,19 +9649,19 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
;
; GFX9-LABEL: srem_i64_pow2k_denom:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_ashr_i32 s4, s3, 31
-; GFX9-NEXT: s_lshr_b32 s4, s4, 20
-; GFX9-NEXT: s_add_u32 s4, s2, s4
-; GFX9-NEXT: s_addc_u32 s5, s3, 0
-; GFX9-NEXT: s_and_b32 s4, s4, 0xfffff000
-; GFX9-NEXT: s_sub_u32 s2, s2, s4
-; GFX9-NEXT: s_subb_u32 s3, s3, s5
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_ashr_i32 s0, s7, 31
+; GFX9-NEXT: s_lshr_b32 s0, s0, 20
+; GFX9-NEXT: s_add_u32 s0, s6, s0
+; GFX9-NEXT: s_addc_u32 s1, s7, 0
+; GFX9-NEXT: s_and_b32 s0, s0, 0xfffff000
+; GFX9-NEXT: s_sub_u32 s0, s6, s0
+; GFX9-NEXT: s_subb_u32 s1, s7, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%r = srem i64 %x, 4096
store i64 %r, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index d6137597293f6..c6233642110ea 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -227,10 +227,10 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32,
; SI: s_load_dword [[B:s[0-9]+]]
; SI: s_load_dwordx2
; SI-NOT: and
-; SI: s_lshl_b32 [[A]], [[A]], 1
-; SI: s_lshl_b32 [[B]], [[B]], 1
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 62
-; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
+; SI: s_lshl_b32 [[C:s[0-9]+]], [[A]], 1
+; SI: s_lshl_b32 [[D:s[0-9]+]], [[B]], 1
+; SI: s_and_b32 s{{[0-9]+}}, [[C]], 62
+; SI: s_and_b32 s{{[0-9]+}}, [[D]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
@@ -371,9 +371,9 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
; FUNC-LABEL: {{^}}s_and_inline_imm_64_i64_noshrink:
; SI: s_load_dword [[A:s[0-9]+]]
-; SI: s_lshl_b32 [[A]], [[A]], 1{{$}}
+; SI: s_lshl_b32 [[B:s[0-9]+]], [[A]], 1{{$}}
; SI-NOT: and
-; SI: s_and_b32 s{{[0-9]+}}, [[A]], 64
+; SI: s_and_b32 s{{[0-9]+}}, [[B]], 64
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 897e134ee48d8..4617a53575131 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -22,17 +22,17 @@ define amdgpu_kernel void @anyext_i1_i32(ptr addrspace(1) %out, i32 %cond) #0 {
;
; GFX8-LABEL: anyext_i1_i32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_cmp_eq_u32 s4, 0
-; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GFX8-NEXT: s_cmp_eq_u32 s2, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: anyext_i1_i32:
@@ -89,15 +89,15 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: s_anyext_i16_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 624101dc12c5f..220fa5a1cc95b 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -63,13 +63,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -91,13 +91,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -119,13 +119,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -146,13 +147,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -175,14 +177,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -206,14 +208,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -238,14 +240,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -269,14 +271,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -336,14 +338,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -366,14 +368,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -396,13 +398,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -424,13 +427,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -454,14 +458,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -486,14 +490,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -519,14 +523,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -551,14 +555,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -612,13 +616,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -653,13 +657,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -693,13 +697,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -732,13 +737,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -774,14 +780,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -817,13 +823,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -861,14 +867,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -904,13 +910,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -969,13 +975,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB3_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1012,13 +1018,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB3_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: struct_add_i32_varying_vdata:
@@ -1055,13 +1061,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W64-NEXT: .LBB3_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: struct_add_i32_varying_vdata:
@@ -1097,13 +1104,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: .LBB3_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: struct_add_i32_varying_vdata:
@@ -1142,14 +1150,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB3_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1188,13 +1196,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB3_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1235,14 +1243,14 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB3_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1281,13 +1289,13 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB3_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1315,12 +1323,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1328,51 +1336,54 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1431,14 +1442,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1460,14 +1471,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1489,14 +1500,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1517,14 +1529,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1547,15 +1560,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1579,15 +1592,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1612,15 +1625,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1644,15 +1657,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1712,14 +1725,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1742,14 +1755,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1772,14 +1785,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1801,14 +1814,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1832,15 +1845,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1865,15 +1878,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB6_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1899,15 +1912,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1932,15 +1945,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1994,13 +2007,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB7_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2035,13 +2048,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB7_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -2075,13 +2088,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB7_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -2114,13 +2128,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB7_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -2156,14 +2171,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB7_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2199,14 +2214,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB7_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -2244,14 +2259,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -2287,14 +2302,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -2322,12 +2337,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2335,51 +2350,54 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 8ee0ee3b27bae..529af3d4e0378 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -48,243 +48,243 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX89-LABEL: add_i32_constant:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b64 s[2:3], exec
+; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX89-NEXT: ; implicit-def: $vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX89-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX89-NEXT: s_cbranch_execz .LBB0_2
; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX89-NEXT: s_mul_i32 s2, s2, 5
; GFX89-NEXT: s_mov_b32 s11, 0xf000
; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, s2
; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: .LBB0_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX89-NEXT: v_readfirstlane_b32 s4, v1
+; GFX89-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX89-NEXT: v_readfirstlane_b32 s0, v1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4
-; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s5
+; GFX1132-NEXT: v_mov_b32_e32 v1, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: add_i32_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB0_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_mul_i32 s2, s2, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s2
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB0_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i32_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b32 s0, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB0_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_i32 s1, s1, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s5
+; GFX1232-NEXT: v_mov_b32_e32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB0_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -647,280 +647,280 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: add_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s8, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB2_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: v_readlane_b32 s6, v0, s4
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
+; GFX8-NEXT: v_writelane_b32 v1, s8, m0
+; GFX8-NEXT: s_add_i32 s8, s8, s6
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB2_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB2_4
; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s15, 0xf000
+; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX8-NEXT: s_mov_b32 s12, s6
+; GFX8-NEXT: s_mov_b32 s13, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB2_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB2_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: v_readlane_b32 s6, v0, s4
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
+; GFX9-NEXT: v_writelane_b32 v1, s8, m0
+; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB2_4
; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xf000
+; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: buffer_atomic_add v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB2_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_mov_b32 s8, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s8, s6
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
+; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB2_4
; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT: s_add_i32 s2, s2, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB2_4
; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
+; GFX1164-NEXT: s_mov_b32 s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s8, s6
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
+; GFX1164-NEXT: s_add_i32 s8, s8, s7
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB2_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT: s_add_i32 s2, s2, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execz .LBB2_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132-NEXT: v_mov_b32_e32 v0, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -928,97 +928,97 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-LABEL: add_i32_varying:
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
+; GFX1264-NEXT: s_mov_b32 s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1264-NEXT: v_writelane_b32 v1, s8, s6
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264-NEXT: s_add_co_i32 s8, s8, s7
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1264-NEXT: s_cbranch_execz .LBB2_4
; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB2_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i32_varying:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: .LBB2_1: ; %ComputeLoop
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_ctz_i32_b32 s4, s3
; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1232-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1232-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1232-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1232-NEXT: s_add_co_i32 s2, s2, s5
+; GFX1232-NEXT: s_cmp_lg_u32 s3, 0
; GFX1232-NEXT: s_cbranch_scc1 .LBB2_1
; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1232-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1232-NEXT: s_cbranch_execz .LBB2_4
; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232-NEXT: v_mov_b32_e32 v0, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_mov_b32 s10, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB2_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_add_nc_u32_e32 v0, s0, v1
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -1071,260 +1071,259 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX89-LABEL: add_i64_constant:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b64 s[6:7], exec
-; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b64 s[2:3], exec
+; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX89-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX89-NEXT: s_cbranch_execz .LBB3_2
; GFX89-NEXT: ; %bb.1:
-; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX89-NEXT: s_mul_i32 s2, s2, 5
; GFX89-NEXT: s_mov_b32 s11, 0xf000
; GFX89-NEXT: s_mov_b32 s10, -1
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v0, s2
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
; GFX89-NEXT: .LBB3_2:
-; GFX89-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX89-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: v_readfirstlane_b32 s1, v1
+; GFX89-NEXT: v_mov_b32_e32 v0, s0
+; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_readfirstlane_b32 s2, v0
-; GFX89-NEXT: v_readfirstlane_b32 s3, v1
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
-; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
-; GFX89-NEXT: s_nop 2
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB3_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB3_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB3_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB3_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB3_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB3_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB3_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: add_i64_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB3_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
+; GFX1264-NEXT: s_mul_u64 s[2:3], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: v_mov_b32_e32 v0, s2
+; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB3_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: add_i64_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB3_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s3
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[0:1], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB3_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[0:1]
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -1383,21 +1382,21 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB4_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s1, s6
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s2, v0, 0
+; GFX8-NEXT: s_mul_i32 s6, s3, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
@@ -1406,14 +1405,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB4_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
+; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
@@ -1548,9 +1547,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
@@ -1561,9 +1560,9 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s9, s1, s8
-; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1164-NEXT: s_mul_i32 s8, s0, s8
+; GFX1164-NEXT: s_mul_i32 s9, s3, s8
+; GFX1164-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1164-NEXT: s_mul_i32 s8, s2, s8
; GFX1164-NEXT: s_add_i32 s10, s10, s9
; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: v_mov_b32_e32 v1, s10
@@ -1575,15 +1574,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -1595,24 +1594,24 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB4_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s8, s1, s3
-; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1132-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132-NEXT: s_mul_i32 s8, s3, s1
+; GFX1132-NEXT: s_mul_hi_u32 s9, s2, s1
+; GFX1132-NEXT: s_mul_i32 s1, s2, s1
; GFX1132-NEXT: s_add_i32 s9, s9, s8
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s9
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_mov_b32 s8, s6
; GFX1132-NEXT: s_mov_b32 s9, s7
@@ -1621,15 +1620,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -1641,11 +1640,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1264-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -1654,7 +1653,7 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[2:3], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
@@ -1665,15 +1664,15 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB4_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s3, v2, v[1:2]
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1683,22 +1682,22 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1232-NEXT: s_mov_b32 s9, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB4_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[2:3], s[0:1]
; GFX1232-NEXT: s_mov_b32 s14, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
; GFX1232-NEXT: buffer_atomic_add_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
@@ -1706,14 +1705,14 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB4_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3]
-; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v2, s[0:1]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s3, v2, v[1:2]
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1746,82 +1745,82 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX89-LABEL: add_i64_varying:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s10, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i64_varying:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i64_varying:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_atomic_add_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s1, s5
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1869,283 +1868,283 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: sub_i32_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB6_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB6_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB6_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
; GFX1064-NEXT: ; implicit-def: $vgpr1
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s10, -1
-; GFX1064-NEXT: v_mov_b32_e32 v1, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s2
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB6_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s10, -1
-; GFX1032-NEXT: v_mov_b32_e32 v1, s5
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB6_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s10, -1
-; GFX1164-NEXT: v_mov_b32_e32 v1, s6
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB6_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_mov_b32_e32 v1, s5
+; GFX1132-NEXT: v_mov_b32_e32 v1, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB6_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: sub_i32_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1264-NEXT: s_cbranch_execz .LBB6_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_i32 s6, s6, 5
+; GFX1264-NEXT: s_mul_i32 s2, s2, 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v1, s6
+; GFX1264-NEXT: v_mov_b32_e32 v1, s2
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB6_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v1
; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i32_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s5, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
+; GFX1232-NEXT: s_mov_b32 s0, exec_lo
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1232-NEXT: s_cbranch_execz .LBB6_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1232-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_i32 s5, s5, 5
+; GFX1232-NEXT: s_mul_i32 s1, s1, 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_mov_b32_e32 v1, s5
+; GFX1232-NEXT: v_mov_b32_e32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB6_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v1
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -2514,280 +2513,280 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: sub_i32_varying:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_mov_b64 s[2:3], exec
-; GFX8-NEXT: s_mov_b32 s6, 0
+; GFX8-NEXT: s_mov_b32 s8, 0
; GFX8-NEXT: ; implicit-def: $vgpr1
; GFX8-NEXT: .LBB8_1: ; %ComputeLoop
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX8-NEXT: s_mov_b32 m0, s4
-; GFX8-NEXT: v_readlane_b32 s7, v0, s4
+; GFX8-NEXT: v_readlane_b32 s6, v0, s4
; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX8-NEXT: v_writelane_b32 v1, s6, m0
-; GFX8-NEXT: s_add_i32 s6, s6, s7
+; GFX8-NEXT: v_writelane_b32 v1, s8, m0
+; GFX8-NEXT: s_add_i32 s8, s8, s6
; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX8-NEXT: s_cbranch_scc1 .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: ; implicit-def: $vgpr0
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX8-NEXT: s_cbranch_execz .LBB8_4
; GFX8-NEXT: ; %bb.3:
-; GFX8-NEXT: s_mov_b32 s11, 0xf000
-; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s15, 0xf000
+; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX8-NEXT: s_mov_b32 s12, s6
+; GFX8-NEXT: s_mov_b32 s13, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB8_4:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: s_mov_b32 s6, 0
+; GFX9-NEXT: s_mov_b32 s8, 0
; GFX9-NEXT: ; implicit-def: $vgpr1
; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_ff1_i32_b64 s4, s[2:3]
; GFX9-NEXT: s_mov_b32 m0, s4
-; GFX9-NEXT: v_readlane_b32 s7, v0, s4
+; GFX9-NEXT: v_readlane_b32 s6, v0, s4
; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4
-; GFX9-NEXT: v_writelane_b32 v1, s6, m0
-; GFX9-NEXT: s_add_i32 s6, s6, s7
+; GFX9-NEXT: v_writelane_b32 v1, s8, m0
+; GFX9-NEXT: s_add_i32 s8, s8, s6
; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: ; implicit-def: $vgpr0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-NEXT: s_cbranch_execz .LBB8_4
; GFX9-NEXT: ; %bb.3:
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s15, 0xf000
+; GFX9-NEXT: s_mov_b32 s14, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
-; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: buffer_atomic_sub v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB8_4:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_varying:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: s_mov_b32 s6, 0
+; GFX1064-NEXT: s_mov_b32 s8, 0
; GFX1064-NEXT: ; implicit-def: $vgpr1
; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1064-NEXT: s_ff1_i32_b64 s7, s[2:3]
-; GFX1064-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1064-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1064-NEXT: s_ff1_i32_b64 s6, s[2:3]
+; GFX1064-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1064-NEXT: v_writelane_b32 v1, s8, s6
; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GFX1064-NEXT: s_add_i32 s6, s6, s8
+; GFX1064-NEXT: s_add_i32 s8, s8, s7
; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB8_4
; GFX1064-NEXT: ; %bb.3:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s8
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB8_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_varying:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_mov_b32 s2, exec_lo
-; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: ; implicit-def: $vgpr1
; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1032-NEXT: s_ff1_i32_b32 s3, s2
-; GFX1032-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1032-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1032-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1032-NEXT: s_andn2_b32 s2, s2, s6
-; GFX1032-NEXT: s_add_i32 s4, s4, s5
-; GFX1032-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1032-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1032-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1032-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1032-NEXT: s_andn2_b32 s3, s3, s6
+; GFX1032-NEXT: s_add_i32 s2, s2, s5
+; GFX1032-NEXT: s_cmp_lg_u32 s3, 0
; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: ; implicit-def: $vgpr0
-; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB8_4
; GFX1032-NEXT: ; %bb.3:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s4
+; GFX1032-NEXT: v_mov_b32_e32 v0, s2
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB8_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_varying:
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: s_mov_b32 s6, 0
+; GFX1164-NEXT: s_mov_b32 s8, 0
; GFX1164-NEXT: ; implicit-def: $vgpr1
; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1164-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1164-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1164-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1164-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1164-NEXT: v_writelane_b32 v1, s8, s6
; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1164-NEXT: s_add_i32 s6, s6, s8
+; GFX1164-NEXT: s_add_i32 s8, s8, s7
; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0
-; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1164-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execz .LBB8_4
; GFX1164-NEXT: ; %bb.3:
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_4:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i32_varying:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr1
; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1132-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1132-NEXT: s_ctz_i32_b32 s4, s3
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1132-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1132-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1132-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1132-NEXT: s_add_i32 s4, s4, s5
-; GFX1132-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1132-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1132-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1132-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1132-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1132-NEXT: s_add_i32 s2, s2, s5
+; GFX1132-NEXT: s_cmp_lg_u32 s3, 0
; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1132-NEXT: ; implicit-def: $vgpr0
-; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1132-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1132-NEXT: s_cbranch_execz .LBB8_4
; GFX1132-NEXT: ; %bb.3:
-; GFX1132-NEXT: v_mov_b32_e32 v0, s4
+; GFX1132-NEXT: v_mov_b32_e32 v0, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_4:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2795,97 +2794,97 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-LABEL: sub_i32_varying:
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_mov_b64 s[2:3], exec
-; GFX1264-NEXT: s_mov_b32 s6, 0
+; GFX1264-NEXT: s_mov_b32 s8, 0
; GFX1264-NEXT: ; implicit-def: $vgpr1
; GFX1264-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1264-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1264-NEXT: s_ctz_i32_b64 s7, s[2:3]
+; GFX1264-NEXT: s_ctz_i32_b64 s6, s[2:3]
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_readlane_b32 s8, v0, s7
-; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s7
-; GFX1264-NEXT: v_writelane_b32 v1, s6, s7
+; GFX1264-NEXT: v_readlane_b32 s7, v0, s6
+; GFX1264-NEXT: s_lshl_b64 s[4:5], 1, s6
+; GFX1264-NEXT: v_writelane_b32 v1, s8, s6
; GFX1264-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5]
-; GFX1264-NEXT: s_add_co_i32 s6, s6, s8
+; GFX1264-NEXT: s_add_co_i32 s8, s8, s7
; GFX1264-NEXT: s_cmp_lg_u64 s[2:3], 0
; GFX1264-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1264-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
; GFX1264-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0
-; GFX1264-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1264-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1264-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1264-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX1264-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX1264-NEXT: s_cbranch_execz .LBB8_4
; GFX1264-NEXT: ; %bb.3:
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
+; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: s_mov_b32 s10, -1
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB8_4:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1264-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i32_varying:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s4, 0
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s2, 0
; GFX1232-NEXT: ; implicit-def: $vgpr1
; GFX1232-NEXT: .LBB8_1: ; %ComputeLoop
; GFX1232-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1232-NEXT: s_ctz_i32_b32 s3, s2
+; GFX1232-NEXT: s_ctz_i32_b32 s4, s3
; GFX1232-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_readlane_b32 s5, v0, s3
-; GFX1232-NEXT: s_lshl_b32 s6, 1, s3
-; GFX1232-NEXT: v_writelane_b32 v1, s4, s3
-; GFX1232-NEXT: s_and_not1_b32 s2, s2, s6
-; GFX1232-NEXT: s_add_co_i32 s4, s4, s5
-; GFX1232-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1232-NEXT: v_readlane_b32 s5, v0, s4
+; GFX1232-NEXT: s_lshl_b32 s6, 1, s4
+; GFX1232-NEXT: v_writelane_b32 v1, s2, s4
+; GFX1232-NEXT: s_and_not1_b32 s3, s3, s6
+; GFX1232-NEXT: s_add_co_i32 s2, s2, s5
+; GFX1232-NEXT: s_cmp_lg_u32 s3, 0
; GFX1232-NEXT: s_cbranch_scc1 .LBB8_1
; GFX1232-NEXT: ; %bb.2: ; %ComputeEnd
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1232-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1232-NEXT: ; implicit-def: $vgpr0
-; GFX1232-NEXT: s_and_saveexec_b32 s5, vcc_lo
-; GFX1232-NEXT: s_xor_b32 s5, exec_lo, s5
+; GFX1232-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1232-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1232-NEXT: s_cbranch_execz .LBB8_4
; GFX1232-NEXT: ; %bb.3:
-; GFX1232-NEXT: v_mov_b32_e32 v0, s4
+; GFX1232-NEXT: v_mov_b32_e32 v0, s2
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
; GFX1232-NEXT: s_mov_b32 s10, -1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB8_4:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX1232-NEXT: v_sub_nc_u32_e32 v0, s0, v1
+; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -2938,317 +2937,313 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: sub_i64_constant:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB9_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT: s_mul_i32 s2, s2, 5
; GFX8-NEXT: s_mov_b32 s11, 0xf000
; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB9_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_mul_i32 s2, s2, 5
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: .LBB9_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mul_i32 s6, s6, 5
+; GFX1064-NEXT: s_mul_i32 s2, s2, 5
; GFX1064-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: s_mov_b32 s10, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mov_b32 s8, s2
-; GFX1064-NEXT: s_mov_b32 s9, s3
+; GFX1064-NEXT: s_mov_b32 s8, s6
+; GFX1064-NEXT: s_mov_b32 s9, s7
; GFX1064-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: buffer_gl1_inv
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB9_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mul_i32 s5, s5, 5
+; GFX1032-NEXT: s_mul_i32 s1, s1, 5
; GFX1032-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mov_b32 s8, s2
-; GFX1032-NEXT: s_mov_b32 s9, s3
+; GFX1032-NEXT: s_mov_b32 s8, s6
+; GFX1032-NEXT: s_mov_b32 s9, s7
; GFX1032-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: buffer_gl1_inv
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB9_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_constant:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB9_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: s_mul_i32 s6, s6, 5
+; GFX1164-NEXT: s_mul_i32 s2, s2, 5
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: s_mov_b32 s10, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mov_b32 s8, s2
-; GFX1164-NEXT: s_mov_b32 s9, s3
+; GFX1164-NEXT: s_mov_b32 s8, s6
+; GFX1164-NEXT: s_mov_b32 s9, s7
; GFX1164-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_constant:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB9_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1132-NEXT: s_mul_i32 s5, s5, 5
+; GFX1132-NEXT: s_mul_i32 s1, s1, 5
; GFX1132-NEXT: s_mov_b32 s10, -1
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mov_b32 s8, s2
-; GFX1132-NEXT: s_mov_b32 s9, s3
+; GFX1132-NEXT: s_mov_b32 s8, s6
+; GFX1132-NEXT: s_mov_b32 s9, s7
; GFX1132-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX1264-LABEL: sub_i64_constant:
; GFX1264: ; %bb.0: ; %entry
-; GFX1264-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1264-NEXT: s_mov_b64 s[6:7], exec
+; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1264-NEXT: s_mov_b64 s[2:3], exec
; GFX1264-NEXT: s_mov_b32 s9, 0
-; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1264-NEXT: s_mov_b64 s[4:5], exec
+; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1264-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1264-NEXT: s_cbranch_execz .LBB9_2
; GFX1264-NEXT: ; %bb.1:
-; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT: s_mul_u64 s[6:7], s[8:9], 5
+; GFX1264-NEXT: s_mul_u64 s[2:3], s[8:9], 5
; GFX1264-NEXT: s_mov_b32 s10, -1
-; GFX1264-NEXT: v_mov_b32_e32 v0, s6
-; GFX1264-NEXT: v_mov_b32_e32 v1, s7
+; GFX1264-NEXT: v_mov_b32_e32 v0, s2
+; GFX1264-NEXT: v_mov_b32_e32 v1, s3
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mov_b32 s8, s2
-; GFX1264-NEXT: s_mov_b32 s9, s3
+; GFX1264-NEXT: s_mov_b32 s8, s6
+; GFX1264-NEXT: s_mov_b32 s9, s7
; GFX1264-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB9_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
; GFX1264-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: s_mov_b32 s2, -1
-; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1264-NEXT: s_wait_kmcnt 0x0
+; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1264-NEXT: s_mov_b32 s6, -1
+; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1264-NEXT: s_nop 0
; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1264-NEXT: s_endpgm
;
; GFX1232-LABEL: sub_i64_constant:
; GFX1232: ; %bb.0: ; %entry
-; GFX1232-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1232-NEXT: s_mov_b32 s4, exec_lo
-; GFX1232-NEXT: s_mov_b32 s5, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0
-; GFX1232-NEXT: s_mov_b32 s6, exec_lo
+; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1232-NEXT: s_mov_b32 s3, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1232-NEXT: s_mov_b32 s2, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB9_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s4, s4
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s3
; GFX1232-NEXT: s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT: s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[0:1], 5
; GFX1232-NEXT: s_mov_b32 s10, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mov_b32 s8, s2
-; GFX1232-NEXT: s_mov_b32 s9, s3
+; GFX1232-NEXT: s_mov_b32 s8, s6
+; GFX1232-NEXT: s_mov_b32 s9, s7
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX1232-NEXT: s_wait_loadcnt 0x0
; GFX1232-NEXT: global_inv scope:SCOPE_DEV
; GFX1232-NEXT: .LBB9_2:
-; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s6
-; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
; GFX1232-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: s_mov_b32 s2, -1
-; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX1232-NEXT: s_wait_kmcnt 0x0
+; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1232-NEXT: s_mov_b32 s6, -1
+; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX1232-NEXT: s_nop 0
; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1232-NEXT: s_endpgm
@@ -3307,21 +3302,21 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_mov_b64 s[8:9], exec
; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB10_2
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s12, s6
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s1, s6
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s2, v0, 0
+; GFX8-NEXT: s_mul_i32 s6, s3, s6
; GFX8-NEXT: s_mov_b32 s15, 0xf000
; GFX8-NEXT: s_mov_b32 s14, -1
; GFX8-NEXT: s_mov_b32 s13, s7
@@ -3330,10 +3325,10 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB10_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
+; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
@@ -3481,9 +3476,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164: ; %bb.0: ; %entry
; GFX1164-NEXT: s_clause 0x1
; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1164-NEXT: s_mov_b64 s[8:9], exec
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
@@ -3494,9 +3489,9 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: s_bcnt1_i32_b64 s8, s[8:9]
; GFX1164-NEXT: s_mov_b32 s11, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s9, s1, s8
-; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8
-; GFX1164-NEXT: s_mul_i32 s8, s0, s8
+; GFX1164-NEXT: s_mul_i32 s9, s3, s8
+; GFX1164-NEXT: s_mul_hi_u32 s10, s2, s8
+; GFX1164-NEXT: s_mul_i32 s8, s2, s8
; GFX1164-NEXT: s_add_i32 s10, s10, s9
; GFX1164-NEXT: v_mov_b32_e32 v0, s8
; GFX1164-NEXT: v_mov_b32_e32 v1, s10
@@ -3508,17 +3503,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1164-NEXT: buffer_gl1_inv
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB10_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -3530,24 +3525,24 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132: ; %bb.0: ; %entry
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1132-NEXT: s_mov_b32 s8, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s8, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB10_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s8
; GFX1132-NEXT: s_mov_b32 s11, 0x31016000
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s8, s1, s3
-; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3
-; GFX1132-NEXT: s_mul_i32 s3, s0, s3
+; GFX1132-NEXT: s_mul_i32 s8, s3, s1
+; GFX1132-NEXT: s_mul_hi_u32 s9, s2, s1
+; GFX1132-NEXT: s_mul_i32 s1, s2, s1
; GFX1132-NEXT: s_add_i32 s9, s9, s8
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s9
; GFX1132-NEXT: s_mov_b32 s10, -1
; GFX1132-NEXT: s_mov_b32 s8, s6
; GFX1132-NEXT: s_mov_b32 s9, s7
@@ -3556,17 +3551,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1132-NEXT: buffer_gl1_inv
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB10_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_mov_b32 s6, -1
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
-; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
@@ -3578,11 +3573,11 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264: ; %bb.0: ; %entry
; GFX1264-NEXT: s_clause 0x1
; GFX1264-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1264-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1264-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1264-NEXT: s_mov_b64 s[8:9], exec
; GFX1264-NEXT: s_mov_b32 s11, 0
; GFX1264-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
-; GFX1264-NEXT: s_mov_b64 s[2:3], exec
+; GFX1264-NEXT: s_mov_b64 s[0:1], exec
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0
; GFX1264-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3591,7 +3586,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: ; %bb.1:
; GFX1264-NEXT: s_bcnt1_i32_b64 s10, s[8:9]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: s_mul_u64 s[8:9], s[0:1], s[10:11]
+; GFX1264-NEXT: s_mul_u64 s[8:9], s[2:3], s[10:11]
; GFX1264-NEXT: s_mov_b32 s11, 0x31016000
; GFX1264-NEXT: v_mov_b32_e32 v0, s8
; GFX1264-NEXT: v_mov_b32_e32 v1, s9
@@ -3602,17 +3597,17 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1264-NEXT: s_wait_loadcnt 0x0
; GFX1264-NEXT: global_inv scope:SCOPE_DEV
; GFX1264-NEXT: .LBB10_2:
-; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1264-NEXT: s_wait_kmcnt 0x0
-; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s2, v2, 0
; GFX1264-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
; GFX1264-NEXT: s_mov_b32 s7, 0x31016000
; GFX1264-NEXT: s_mov_b32 s6, -1
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
-; GFX1264-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s3, v2, v[4:5]
; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3
-; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1264-NEXT: v_mov_b32_e32 v1, v4
; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
@@ -3624,22 +3619,22 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232: ; %bb.0: ; %entry
; GFX1232-NEXT: s_clause 0x1
; GFX1232-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1232-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX1232-NEXT: s_mov_b32 s2, exec_lo
-; GFX1232-NEXT: s_mov_b32 s3, 0
-; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
+; GFX1232-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1232-NEXT: s_mov_b32 s9, exec_lo
+; GFX1232-NEXT: s_mov_b32 s1, 0
+; GFX1232-NEXT: v_mbcnt_lo_u32_b32 v2, s9, 0
; GFX1232-NEXT: s_mov_b32 s8, exec_lo
; GFX1232-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1232-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1232-NEXT: s_cbranch_execz .LBB10_2
; GFX1232-NEXT: ; %bb.1:
-; GFX1232-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1232-NEXT: s_bcnt1_i32_b32 s0, s9
; GFX1232-NEXT: s_mov_b32 s15, 0x31016000
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: s_mul_u64 s[2:3], s[0:1], s[2:3]
+; GFX1232-NEXT: s_mul_u64 s[0:1], s[2:3], s[0:1]
; GFX1232-NEXT: s_mov_b32 s14, -1
-; GFX1232-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1232-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX1232-NEXT: s_mov_b32 s12, s6
; GFX1232-NEXT: s_mov_b32 s13, s7
; GFX1232-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[12:15], null th:TH_ATOMIC_RETURN
@@ -3648,15 +3643,15 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX1232-NEXT: .LBB10_2:
; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX1232-NEXT: s_wait_kmcnt 0x0
-; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0
+; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s2, v2, 0
; GFX1232-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
; GFX1232-NEXT: s_mov_b32 s7, 0x31016000
; GFX1232-NEXT: s_mov_b32 s6, -1
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5]
-; GFX1232-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s3, v2, v[4:5]
; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
-; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1232-NEXT: v_mov_b32_e32 v1, v4
; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
@@ -3691,82 +3686,82 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
;
; GFX89-LABEL: sub_i64_varying:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: v_mov_b32_e32 v1, 0
; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: buffer_wbinvl1_vol
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s11, s7
-; GFX10-NEXT: s_mov_b32 s10, s6
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s11, s3
+; GFX10-NEXT: s_mov_b32 s10, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_mov_b32 s5, s1
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s1, s5
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s11, s7
-; GFX11-NEXT: s_mov_b32 s10, s6
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_mov_b32 s5, s1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i64_varying:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s11, s7
-; GFX12-NEXT: s_mov_b32 s10, s6
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_atomic_sub_u64 v[0:1], off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_mov_b32 s5, s1
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s1, s5
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index af6f69130910d..98a28b2b716a0 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -64,13 +64,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_constant:
@@ -91,13 +91,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
@@ -120,13 +120,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB0_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
@@ -148,13 +149,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB0_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
@@ -178,14 +180,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB0_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -209,14 +211,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -281,14 +283,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
@@ -311,14 +313,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
@@ -343,13 +345,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1064-NEXT: .LBB1_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
@@ -373,13 +376,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1032-NEXT: .LBB1_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
@@ -405,14 +409,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB1_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1164-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -438,14 +442,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB1_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v1, off, s[0:3], 0
+; GFX1132-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
+; GFX1132-NEXT: buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -503,13 +507,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i32_varying:
@@ -543,13 +547,13 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_add_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i32_varying:
@@ -584,13 +588,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB2_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i32_varying:
@@ -624,13 +629,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB2_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i32_varying:
@@ -667,14 +673,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB2_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -711,14 +717,14 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB2_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_add_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_add_nc_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -974,17 +980,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_readfirstlane_b32 s3, v1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_nop 1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_constant:
@@ -1005,17 +1010,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
-; GFX9-NEXT: v_readfirstlane_b32 s3, v1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, v[0:1]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_nop 1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_constant:
@@ -1038,14 +1042,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB4_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 5, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_constant:
@@ -1067,14 +1072,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB4_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, v2, 5, s[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_constant:
@@ -1098,15 +1104,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB4_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1131,15 +1137,15 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB4_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1196,228 +1202,229 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
;
; GFX8-LABEL: add_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB5_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s3, s8
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, 0
+; GFX8-NEXT: s_mul_i32 s2, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB5_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1]
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: v_mul_lo_u32 v3, s7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s3, s6
-; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_mul_i32 s3, s7, s2
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB5_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1]
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, v[0:1]
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: s_nop 2
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: add_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB5_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s7, s3, s6
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT: s_mul_i32 s6, s2, s6
-; GFX1064-NEXT: s_add_i32 s8, s8, s7
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: s_mul_i32 s3, s7, s2
+; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-NEXT: s_add_i32 s8, s8, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB5_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, s[4:5]
-; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2]
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v2, s[0:1]
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v2, v[1:2]
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: add_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB5_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s6, s3, s5
-; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT: s_mul_i32 s5, s2, s5
-; GFX1032-NEXT: s_add_i32 s7, s7, s6
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: s_mul_i32 s2, s7, s1
+; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT: s_mul_i32 s1, s6, s1
+; GFX1032-NEXT: s_add_i32 s3, s3, s2
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB5_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v2, s[4:5]
-; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s2, s3, v2, v[1:2]
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s0, s6, v2, s[0:1]
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[1:2], s0, s7, v2, v[1:2]
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: add_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB5_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s7, s3, s6
-; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1164-NEXT: s_mul_i32 s6, s2, s6
-; GFX1164-NEXT: s_add_i32 s8, s8, s7
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: s_mul_i32 s3, s7, s2
+; GFX1164-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_add_i32 s8, s8, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: v_mov_b32_e32 v1, s8
; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB5_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX1164-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, s6, v2, s[0:1]
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s7, v2, v[1:2]
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v3
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: add_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB5_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s6, s3, s5
-; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1132-NEXT: s_mul_i32 s5, s2, s5
-; GFX1132-NEXT: s_add_i32 s7, s7, s6
+; GFX1132-NEXT: s_mul_i32 s2, s7, s1
+; GFX1132-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1132-NEXT: s_mul_i32 s1, s6, s1
+; GFX1132-NEXT: s_add_i32 s3, s3, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s3
; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB5_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s5, v1
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s2, v2, s[4:5]
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s3, v2, v[1:2]
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, s6, v2, s[0:1]
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s7, v2, v[1:2]
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v3
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1447,51 +1454,51 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: add_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1554,14 +1561,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB7_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_constant:
@@ -1582,14 +1589,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_constant:
@@ -1612,14 +1619,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB7_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_constant:
@@ -1641,14 +1649,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB7_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_constant:
@@ -1672,15 +1681,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB7_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1704,15 +1713,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB7_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -1777,14 +1786,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB8_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX8-NEXT: v_readfirstlane_b32 s4, v1
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_uniform:
@@ -1807,14 +1816,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX9-NEXT: v_readfirstlane_b32 s4, v1
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_uniform:
@@ -1839,14 +1848,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1064-NEXT: .LBB8_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_uniform:
@@ -1870,14 +1879,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1032-NEXT: .LBB8_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_uniform:
@@ -1903,15 +1912,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB8_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_mul_lo_u32 v0, s6, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1164-NEXT: s_mov_b32 s2, -1
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -1937,15 +1946,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB8_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_mul_lo_u32 v0, s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX1132-NEXT: s_mov_b32 s2, -1
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2003,13 +2012,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB9_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i32_varying:
@@ -2043,13 +2052,13 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB9_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_sub_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i32_varying:
@@ -2084,13 +2093,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB9_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i32_varying:
@@ -2124,13 +2134,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB9_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i32_varying:
@@ -2167,14 +2178,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB9_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -2211,14 +2222,14 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB9_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_sub_nc_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2474,18 +2485,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB11_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_constant:
@@ -2506,18 +2517,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2
; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_constant:
@@ -2540,17 +2551,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB11_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_constant:
@@ -2572,17 +2584,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB11_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_constant:
@@ -2606,18 +2619,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB11_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
; GFX1164-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -2642,18 +2655,18 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB11_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2710,241 +2723,241 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
;
; GFX8-LABEL: sub_i64_uniform:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b64 s[2:3], exec
+; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT: s_cbranch_execz .LBB12_2
; GFX8-NEXT: ; %bb.1:
-; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7]
+; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0
-; GFX8-NEXT: s_mul_i32 s6, s3, s8
+; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, 0
+; GFX8-NEXT: s_mul_i32 s2, s7, s8
; GFX8-NEXT: v_mov_b32_e32 v3, 0
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB12_2:
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
-; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2
-; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
+; GFX8-NEXT: v_mul_lo_u32 v4, s7, v2
+; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s6, v2, 0
+; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: v_readfirstlane_b32 s5, v1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_uniform:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[6:7], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_2
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s3, s6
-; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX9-NEXT: s_add_i32 s8, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s2, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: s_mul_i32 s3, s7, s2
+; GFX9-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX9-NEXT: s_add_i32 s8, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s6, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s8
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB12_2:
-; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX9-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-NEXT: v_readfirstlane_b32 s1, v1
+; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s7, v2, v[4:5]
+; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: v_readfirstlane_b32 s5, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v4
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v3
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: sub_i64_uniform:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1064-NEXT: s_mov_b64 s[6:7], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_2
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_mul_i32 s7, s3, s6
-; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1064-NEXT: s_mul_i32 s6, s2, s6
-; GFX1064-NEXT: s_add_i32 s8, s8, s7
-; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: s_mul_i32 s3, s7, s2
+; GFX1064-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1064-NEXT: s_mul_i32 s2, s6, s2
+; GFX1064-NEXT: s_add_i32 s8, s8, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s2
; GFX1064-NEXT: v_mov_b32_e32 v1, s8
; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: .LBB12_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0
-; GFX1064-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5]
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3
+; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s6, v2, 0
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v2, v[4:5]
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v3
; GFX1064-NEXT: v_mov_b32_e32 v1, v4
-; GFX1064-NEXT: s_mov_b32 s2, -1
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: sub_i64_uniform:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_2
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_mul_i32 s6, s3, s5
-; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1032-NEXT: s_mul_i32 s5, s2, s5
-; GFX1032-NEXT: s_add_i32 s7, s7, s6
-; GFX1032-NEXT: v_mov_b32_e32 v0, s5
-; GFX1032-NEXT: v_mov_b32_e32 v1, s7
+; GFX1032-NEXT: s_mul_i32 s2, s7, s1
+; GFX1032-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1032-NEXT: s_mul_i32 s1, s6, s1
+; GFX1032-NEXT: s_add_i32 s3, s3, s2
+; GFX1032-NEXT: v_mov_b32_e32 v0, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s3
; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: .LBB12_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
-; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0
-; GFX1032-NEXT: v_readfirstlane_b32 s4, v1
-; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5]
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
+; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s0, s6, v2, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s0, s7, v2, v[4:5]
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
; GFX1032-NEXT: v_mov_b32_e32 v1, v4
-; GFX1032-NEXT: s_mov_b32 s2, -1
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: sub_i64_uniform:
; GFX1164: ; %bb.0: ; %entry
-; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1164-NEXT: s_mov_b64 s[6:7], exec
-; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX1164-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s3, v0
; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1164-NEXT: s_cbranch_execz .LBB12_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_mul_i32 s7, s3, s6
-; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6
-; GFX1164-NEXT: s_mul_i32 s6, s2, s6
-; GFX1164-NEXT: s_add_i32 s8, s8, s7
-; GFX1164-NEXT: v_mov_b32_e32 v0, s6
+; GFX1164-NEXT: s_mul_i32 s3, s7, s2
+; GFX1164-NEXT: s_mul_hi_u32 s8, s6, s2
+; GFX1164-NEXT: s_mul_i32 s2, s6, s2
+; GFX1164-NEXT: s_add_i32 s8, s8, s3
+; GFX1164-NEXT: v_mov_b32_e32 v0, s2
; GFX1164-NEXT: v_mov_b32_e32 v1, s8
; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB12_2:
-; GFX1164-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX1164-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s6, v2, 0
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_waitcnt_depctr 0xfff
-; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
-; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s7, v2, v[4:5]
+; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s0, v3
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_mov_b32_e32 v1, v5
-; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: sub_i64_uniform:
; GFX1132: ; %bb.0: ; %entry
-; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX1132-NEXT: s_mov_b32 s5, exec_lo
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0
+; GFX1132-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0
; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
; GFX1132-NEXT: s_cbranch_execz .LBB12_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s2
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_mul_i32 s6, s3, s5
-; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5
-; GFX1132-NEXT: s_mul_i32 s5, s2, s5
-; GFX1132-NEXT: s_add_i32 s7, s7, s6
+; GFX1132-NEXT: s_mul_i32 s2, s7, s1
+; GFX1132-NEXT: s_mul_hi_u32 s3, s6, s1
+; GFX1132-NEXT: s_mul_i32 s1, s6, s1
+; GFX1132-NEXT: s_add_i32 s3, s3, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7
+; GFX1132-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s3
; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB12_2:
-; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s4, v1
+; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s6, v2, 0
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5]
-; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s7, v2, v[4:5]
+; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_mov_b32_e32 v1, v5
-; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -2974,51 +2987,51 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sub_i64_varying:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i64_varying:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3078,13 +3091,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB14_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_and_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: and_i32_varying:
@@ -3118,13 +3131,13 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB14_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_and_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_and_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: and_i32_varying:
@@ -3159,13 +3172,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB14_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: and_i32_varying:
@@ -3199,13 +3213,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB14_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: and_i32_varying:
@@ -3242,14 +3257,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB14_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3286,14 +3301,14 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB14_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_and_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_and_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3352,13 +3367,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB15_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_or_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: or_i32_varying:
@@ -3392,13 +3407,13 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB15_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_or_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_or_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: or_i32_varying:
@@ -3433,13 +3448,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB15_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: or_i32_varying:
@@ -3473,13 +3489,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB15_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: or_i32_varying:
@@ -3516,14 +3533,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB15_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3560,14 +3577,14 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB15_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_or_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_or_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3626,13 +3643,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB16_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: xor_i32_varying:
@@ -3666,13 +3683,13 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB16_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_xor_b32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: xor_i32_varying:
@@ -3707,13 +3724,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB16_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: xor_i32_varying:
@@ -3747,13 +3765,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB16_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: xor_i32_varying:
@@ -3790,14 +3809,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB16_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -3834,14 +3853,14 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB16_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_xor_b32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_xor_b32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -3900,13 +3919,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB17_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_max_i32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i32_varying:
@@ -3940,13 +3959,13 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB17_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_max_i32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i32_varying:
@@ -3981,13 +4000,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB17_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i32_varying:
@@ -4021,13 +4041,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB17_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: max_i32_varying:
@@ -4064,14 +4085,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB17_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4108,14 +4129,14 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB17_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_max_i32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4180,21 +4201,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB18_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: max_i64_constant:
@@ -4213,21 +4234,21 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB18_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: max_i64_constant:
@@ -4248,18 +4269,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB18_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: max_i64_constant:
@@ -4279,18 +4301,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB18_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: max_i64_constant:
@@ -4311,19 +4334,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB18_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4344,19 +4367,19 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB18_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x80000000, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4414,13 +4437,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB19_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_min_i32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i32_varying:
@@ -4454,13 +4477,13 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB19_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_i32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_min_i32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: min_i32_varying:
@@ -4495,13 +4518,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB19_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i32_varying:
@@ -4535,13 +4559,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB19_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: min_i32_varying:
@@ -4578,14 +4603,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB19_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4622,14 +4647,14 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB19_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_i32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_min_i32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4694,21 +4719,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB20_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: min_i64_constant:
@@ -4727,21 +4752,21 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB20_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_bfrev_b32_e32 v0, -2
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: min_i64_constant:
@@ -4762,18 +4787,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB20_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: min_i64_constant:
@@ -4793,18 +4819,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB20_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: min_i64_constant:
@@ -4825,19 +4852,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB20_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -4858,19 +4885,19 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB20_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 0x7fffffff, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_lt_i64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -4928,13 +4955,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB21_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_max_u32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i32_varying:
@@ -4968,13 +4995,13 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB21_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_max_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_max_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i32_varying:
@@ -5009,13 +5036,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB21_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i32_varying:
@@ -5049,13 +5077,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB21_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umax_i32_varying:
@@ -5092,14 +5121,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB21_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5136,14 +5165,14 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB21_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_max_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_max_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5207,20 +5236,20 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB22_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umax_i64_constant:
@@ -5239,20 +5268,20 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB22_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umax_i64_constant:
@@ -5273,18 +5302,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB22_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
-; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umax_i64_constant:
@@ -5304,18 +5334,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB22_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
-; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umax_i64_constant:
@@ -5336,19 +5367,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB22_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5369,19 +5400,19 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB22_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, 0
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, 0, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s3, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, s1, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5439,13 +5470,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB23_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
+; GFX8-NEXT: v_min_u32_e32 v0, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i32_varying:
@@ -5479,13 +5510,13 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB23_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_min_u32_e32 v0, s4, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_min_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i32_varying:
@@ -5520,13 +5551,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB23_4:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i32_varying:
@@ -5560,13 +5592,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB23_4:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i32_varying:
@@ -5603,14 +5636,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB23_4:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5647,14 +5680,14 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB23_4:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_min_u32_e32 v0, s2, v1
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_min_u32_e32 v0, s0, v1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
@@ -5718,20 +5751,20 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB24_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: v_readfirstlane_b32 s5, v1
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_readfirstlane_b32 s1, v1
; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX8-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: umin_i64_constant:
@@ -5750,20 +5783,20 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB24_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_readfirstlane_b32 s4, v0
-; GFX9-NEXT: v_readfirstlane_b32 s5, v1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: umin_i64_constant:
@@ -5784,18 +5817,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: .LBB24_2:
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1064-NEXT: s_mov_b32 null, 0
+; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1064-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
-; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: s_mov_b32 s2, -1
+; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1064-NEXT: s_mov_b32 s6, -1
+; GFX1064-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1064-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1064-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: umin_i64_constant:
@@ -5815,18 +5849,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: .LBB24_2:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1032-NEXT: s_mov_b32 null, 0
+; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1032-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
-; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: s_mov_b32 s2, -1
+; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1032-NEXT: s_mov_b32 s6, -1
+; GFX1032-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1032-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1032-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: umin_i64_constant:
@@ -5847,19 +5882,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB24_2:
; GFX1164-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1164-NEXT: v_readfirstlane_b32 s1, v1
; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
; GFX1164-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX1164-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1164-NEXT: s_mov_b32 s6, -1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
-; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc
-; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc
-; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: s_mov_b32 s2, -1
+; GFX1164-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX1164-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc
+; GFX1164-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
@@ -5880,19 +5915,19 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: buffer_gl0_inv
; GFX1132-NEXT: .LBB24_2:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1132-NEXT: v_readfirstlane_b32 s1, v1
; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
; GFX1132-NEXT: v_cndmask_b32_e64 v0, 5, -1, vcc_lo
+; GFX1132-NEXT: s_mov_b32 s7, 0x31016000
+; GFX1132-NEXT: s_mov_b32 s6, -1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
-; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo
-; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: s_mov_b32 s2, -1
+; GFX1132-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[0:1], v[0:1]
+; GFX1132-NEXT: v_cndmask_b32_e64 v1, v1, s1, vcc_lo
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, v0, s0, vcc_lo
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
index ca94d68f01917..aa5c480dfbeed 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll
@@ -62,13 +62,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -90,13 +90,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -118,13 +118,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -145,13 +146,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -174,14 +176,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -205,14 +207,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -237,14 +239,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -268,14 +270,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -335,14 +337,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -365,14 +367,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -395,13 +397,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -423,13 +426,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -453,14 +457,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -485,14 +489,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -518,14 +522,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -550,14 +554,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -611,13 +615,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -652,13 +656,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -692,13 +696,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -731,13 +736,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -773,14 +779,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -816,13 +822,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -860,14 +866,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -903,13 +909,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -937,12 +943,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -950,51 +956,54 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1053,14 +1062,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1082,14 +1091,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB4_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1111,14 +1120,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB4_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1139,14 +1149,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB4_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1169,15 +1180,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB4_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1201,15 +1212,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB4_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1234,15 +1245,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB4_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1266,15 +1277,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB4_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1334,14 +1345,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1364,14 +1375,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1394,14 +1405,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1423,14 +1434,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1454,15 +1465,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1487,15 +1498,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1521,15 +1532,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1554,15 +1565,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1616,13 +1627,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX8-NEXT: .LBB6_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1657,13 +1668,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
; GFX9-NEXT: .LBB6_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1697,13 +1708,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB6_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1736,13 +1748,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB6_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1778,14 +1791,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
; GFX11W64-NEXT: .LBB6_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1821,14 +1834,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
; GFX11W32-NEXT: .LBB6_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1866,14 +1879,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, off, s[8:11], null th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1909,14 +1922,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, off, s[4:7], null th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1944,12 +1957,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1957,51 +1970,54 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_offset:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_offset:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 7e15c07f95269..783c5d4dd7ba9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -64,13 +64,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB0_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -93,13 +93,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB0_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_constant:
@@ -122,13 +122,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB0_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_constant:
@@ -150,13 +151,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB0_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_constant:
@@ -180,14 +182,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB0_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -212,14 +214,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB0_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -245,14 +247,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB0_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -276,14 +278,14 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB0_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -345,14 +347,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB1_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -376,14 +378,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB1_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_uniform:
@@ -407,13 +409,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB1_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s6, v0, s[2:3]
+; GFX10W64-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v0, s[0:1]
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_uniform:
@@ -436,13 +439,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB1_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s2, s2, v0, s[4:5]
+; GFX10W32-NEXT: v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_uniform:
@@ -467,14 +471,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB1_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[2:3]
+; GFX11W64-NEXT: v_mad_u64_u32 v[1:2], null, s6, v0, s[0:1]
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -500,14 +504,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB1_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[4:5]
+; GFX11W32-NEXT: v_mad_u64_u32 v[1:2], null, s2, v0, s[0:1]
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -534,14 +538,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB1_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3]
+; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[0:1]
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -566,14 +570,14 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_add_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB1_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5]
+; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[0:1]
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -629,13 +633,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB2_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -671,13 +675,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB2_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_add_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_add_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: add_i32_varying_vdata:
@@ -712,13 +716,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB2_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: add_i32_varying_vdata:
@@ -752,13 +757,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB2_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_vdata:
@@ -795,14 +801,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB2_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -839,13 +845,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB2_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX11W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -884,14 +890,14 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_add_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB2_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_add_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -928,13 +934,13 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_add_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB2_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s2, v1
+; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -962,12 +968,12 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: add_i32_varying_vindex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v0, s[4:7], 0 idxen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -975,51 +981,54 @@ define amdgpu_kernel void @add_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: add_i32_varying_vindex:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_vindex:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: add_i32_varying_vindex:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: add_i32_varying_vindex:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_add_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1053,13 +1062,12 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -1070,13 +1078,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_i32_varying_offset:
@@ -1085,13 +1093,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: add_i32_varying_offset:
@@ -1100,13 +1108,13 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W64-NEXT: s_mov_b32 s2, 0
; GFX11W64-NEXT: v_mov_b32_e32 v1, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s2
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1118,41 +1126,43 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: add_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
+; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: v_mov_b32_e32 v1, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: add_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
+; GFX12W32-NEXT: s_clause 0x1
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1213,14 +1223,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB5_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1243,14 +1253,14 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB5_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_constant:
@@ -1273,14 +1283,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: .LBB5_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_constant:
@@ -1302,14 +1313,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: .LBB5_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_constant:
@@ -1333,15 +1345,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB5_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1366,15 +1378,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB5_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1400,15 +1412,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB5_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1432,15 +1444,15 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB5_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mul_u32_u24_e32 v0, 5, v0
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1502,14 +1514,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB6_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1533,14 +1545,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB6_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_uniform:
@@ -1564,14 +1576,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W64-NEXT: .LBB6_2:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W64-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_uniform:
@@ -1594,14 +1606,14 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX10W32-NEXT: .LBB6_2:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX10W32-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_uniform:
@@ -1626,15 +1638,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB6_2:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -1660,15 +1672,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX11W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
; GFX11W32-NEXT: .LBB6_2:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX11W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -1695,15 +1707,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W64-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB6_2:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W64-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -1728,15 +1740,15 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
; GFX12W32-NEXT: buffer_atomic_sub_u32 v1, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB6_2:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s2, v0
-; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v0, s0, v0
+; GFX12W32-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -1792,13 +1804,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX8-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
; GFX8-NEXT: .LBB7_4:
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v0
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s2, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1834,13 +1846,13 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX9-NEXT: buffer_atomic_sub v0, v2, s[8:11], 0 idxen glc
; GFX9-NEXT: .LBB7_4:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_sub_u32_e32 v0, s2, v1
+; GFX9-NEXT: v_sub_u32_e32 v0, s0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10W64-LABEL: sub_i32_varying_vdata:
@@ -1875,13 +1887,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W64-NEXT: .LBB7_4:
; GFX10W64-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX10W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
-; GFX10W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W64-NEXT: s_mov_b32 null, 0
+; GFX10W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W64-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W64-NEXT: s_endpgm
;
; GFX10W32-LABEL: sub_i32_varying_vdata:
@@ -1915,13 +1928,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: .LBB7_4:
; GFX10W32-NEXT: s_waitcnt_depctr 0xffe3
; GFX10W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX10W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
-; GFX10W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10W32-NEXT: s_mov_b32 null, 0
+; GFX10W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX10W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX10W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10W32-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10W32-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_vdata:
@@ -1958,14 +1972,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], 0 idxen glc
; GFX11W64-NEXT: .LBB7_4:
; GFX11W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2002,14 +2016,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], 0 idxen glc
; GFX11W32-NEXT: .LBB7_4:
; GFX11W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX11W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
@@ -2048,14 +2062,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W64-NEXT: buffer_atomic_sub_u32 v0, v2, s[8:11], null idxen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: .LBB7_4:
; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W64-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W64-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
@@ -2092,14 +2106,14 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: buffer_atomic_sub_u32 v0, v2, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: .LBB7_4:
; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: v_readfirstlane_b32 s2, v0
+; GFX12W32-NEXT: v_readfirstlane_b32 s0, v0
; GFX12W32-NEXT: v_mov_b32_e32 v0, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s2, v1
+; GFX12W32-NEXT: v_sub_nc_u32_e32 v1, s0, v1
; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
@@ -2127,12 +2141,12 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX8-LABEL: sub_i32_varying_vindex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2140,51 +2154,54 @@ define amdgpu_kernel void @sub_i32_varying_vindex(ptr addrspace(1) %out, ptr add
; GFX9-LABEL: sub_i32_varying_vindex:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 1
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_vindex:
; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sub_i32_varying_vindex:
; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 1
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: sub_i32_varying_vindex:
; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 1
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: buffer_atomic_sub_u32 v1, v0, s[4:7], null idxen th:TH_ATOMIC_RETURN
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2218,13 +2235,12 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX8-NEXT: s_mov_b32 s2, 0
; GFX8-NEXT: v_mov_b32_e32 v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -2235,13 +2251,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX9-NEXT: s_mov_b32 s2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v2, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_i32_varying_offset:
@@ -2250,13 +2266,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: s_mov_b32 s2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 1
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11W64-LABEL: sub_i32_varying_offset:
@@ -2265,13 +2281,13 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W64-NEXT: s_mov_b32 s2, 0
; GFX11W64-NEXT: v_mov_b32_e32 v1, v0
; GFX11W64-NEXT: v_mov_b32_e32 v0, s2
+; GFX11W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
; GFX11W64-NEXT: s_waitcnt vmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W64-NEXT: s_nop 0
; GFX11W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W64-NEXT: s_endpgm
@@ -2283,41 +2299,43 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
; GFX11W32-NEXT: s_waitcnt vmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11W32-NEXT: s_nop 0
; GFX11W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11W32-NEXT: s_endpgm
;
; GFX12W64-LABEL: sub_i32_varying_offset:
; GFX12W64: ; %bb.0: ; %entry
+; GFX12W64-NEXT: s_clause 0x1
; GFX12W64-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W64-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W64-NEXT: v_mov_b32_e32 v1, v0
; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W64-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W64-NEXT: s_nop 0
; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W64-NEXT: s_endpgm
;
; GFX12W32-LABEL: sub_i32_varying_offset:
; GFX12W32: ; %bb.0: ; %entry
+; GFX12W32-NEXT: s_clause 0x1
; GFX12W32-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
+; GFX12W32-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12W32-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[4:7], null idxen offen th:TH_ATOMIC_RETURN
; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12W32-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX12W32-NEXT: s_nop 0
; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index ad6009e378178..d74623a9a8b30 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -59,12 +59,12 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6
; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
@@ -73,12 +73,12 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
@@ -140,14 +140,14 @@ entry:
define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -156,12 +156,12 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -175,21 +175,21 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s2, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -201,21 +201,21 @@ entry:
define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s2, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -227,22 +227,22 @@ entry:
define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s6
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: ds_store_b32 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s4
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: ds_store_b32 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
index 0f20ed1320dad..1b277c04b7e6d 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -6,36 +6,36 @@
define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; VI-SDWA-LABEL: bfe_combine8:
; VI-SDWA: ; %bb.0:
-; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDWA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 2
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDWA-NEXT: flat_load_dword v2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDWA-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
; VI-SDWA-NEXT: s_waitcnt vmcnt(0)
; VI-SDWA-NEXT: flat_store_dword v[0:1], v2
; VI-SDWA-NEXT: s_endpgm
@@ -71,40 +71,40 @@ define amdgpu_kernel void @bfe_combine8(ptr addrspace(1) nocapture %arg, i32 %x)
define amdgpu_kernel void @bfe_combine16(ptr addrspace(1) nocapture %arg, i32 %x) {
; VI-LABEL: bfe_combine16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_bfe_u32 v0, v0, 16, 16
; VI-NEXT: v_lshlrev_b32_e32 v0, 15, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; VI-SDWA-LABEL: bfe_combine16:
; VI-SDWA: ; %bb.0:
-; VI-SDWA-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDWA-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 15
; VI-SDWA-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDWA-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-SDWA-NEXT: v_mov_b32_e32 v1, 0
; VI-SDWA-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v2, s1
-; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDWA-NEXT: v_mov_b32_e32 v2, s3
+; VI-SDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-SDWA-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; VI-SDWA-NEXT: flat_load_dword v2, v[0:1]
-; VI-SDWA-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDWA-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDWA-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDWA-NEXT: v_mov_b32_e32 v1, s3
; VI-SDWA-NEXT: s_waitcnt vmcnt(0)
; VI-SDWA-NEXT: flat_store_dword v[0:1], v2
; VI-SDWA-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
index 1639ec6512208..15cd6f79d7818 100644
--- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -23,18 +23,18 @@ define amdgpu_kernel void @v_ubfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_ubfe_sub_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_bfe_u32 v2, v3, 0, v4
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -78,18 +78,18 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_ubfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
@@ -221,18 +221,18 @@ define amdgpu_kernel void @v_sbfe_sub_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_sbfe_sub_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_bfe_i32 v2, v3, 0, v4
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -276,18 +276,18 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(ptr addrspace(1) %out, p
;
; VI-LABEL: v_sbfe_sub_multi_use_shl_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, v2, v3
@@ -418,14 +418,14 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_uniform_i32(ptr addrspace(1) %out,
; VI-LABEL: s_sbfe_or_shl_shl_uniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s0, s2, s0
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_bfe_i32 s0, s0, 0xf0000
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -463,16 +463,16 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_nonuniform_i32(ptr addrspace(1) %ou
; VI-LABEL: s_sbfe_or_shl_shl_nonuniform_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s1, s2, 17
-; VI-NEXT: s_lshl_b32 s0, s0, 19
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_lshl_b32 s0, s0, 17
+; VI-NEXT: s_lshl_b32 s1, s1, 19
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -510,16 +510,16 @@ define amdgpu_kernel void @s_sbfe_or_shl_shl_toosmall_i32(ptr addrspace(1) %out,
; VI-LABEL: s_sbfe_or_shl_shl_toosmall_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[6:7], 0x0
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s1, s2, 17
-; VI-NEXT: s_lshl_b32 s0, s0, 16
-; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: s_lshl_b32 s0, s0, 17
+; VI-NEXT: s_lshl_b32 s1, s1, 16
+; VI-NEXT: s_or_b32 s0, s0, s1
; VI-NEXT: s_ashr_i32 s0, s0, 17
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 7b8eaccaa4142..31b5b168e3817 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1426,11 +1426,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1442,11 +1442,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX10-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1457,11 +1457,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX8-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1473,11 +1473,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
-; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[6:7]
+; GFX10-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1514,11 +1514,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1530,11 +1530,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1545,11 +1545,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1561,11 +1561,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1602,11 +1602,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-LABEL: s_bitselect_i64_pat_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1618,11 +1618,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1633,11 +1633,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1649,11 +1649,11 @@ define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
-; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1691,12 +1691,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX8-LABEL: s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1708,12 +1708,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
@@ -1724,12 +1724,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
-; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
+; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1741,12 +1741,12 @@ define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
-; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
+; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll
index 8b2f66b438f14..935909e85f09b 100644
--- a/llvm/test/CodeGen/AMDGPU/bfm.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfm.ll
@@ -48,13 +48,13 @@ define amdgpu_kernel void @s_bfm_pattern_simple(ptr addrspace(1) %out, i32 %x) #
;
; VI-LABEL: s_bfm_pattern_simple:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfm_b32 s2, s2, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_bfm_b32 s0, s4, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%a = shl i32 1, %x
diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
index 49ec09ddb7770..6c4791d6c65f8 100644
--- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll
@@ -34,41 +34,41 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
;
; FLAT-LABEL: s_brev_i16:
; FLAT: ; %bb.0:
-; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b32 s4, s4
-; FLAT-NEXT: s_lshr_b32 s4, s4, 16
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: buffer_store_short v0, off, s[0:3], 0
+; FLAT-NEXT: s_brev_b32 s0, s2
+; FLAT-NEXT: s_lshr_b32 s0, s0, 16
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: buffer_store_short v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i16:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_and_b32 s2, s2, 0xffff
-; GISEL-NEXT: s_brev_b32 s2, s2
-; GISEL-NEXT: s_lshr_b32 s2, s2, 16
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GISEL-NEXT: s_brev_b32 s0, s0
+; GISEL-NEXT: s_lshr_b32 s0, s0, 16
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_store_short v[0:1], v2
; GISEL-NEXT: s_endpgm
;
; GFX11-FLAT-LABEL: s_brev_i16:
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
-; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-FLAT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
+; GFX11-FLAT-NEXT: s_brev_b32 s0, s4
; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-FLAT-NEXT: global_store_d16_hi_b16 v0, v1, s[2:3]
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
@@ -76,17 +76,17 @@ define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #
; GFX11-GISEL-LABEL: s_brev_i16:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
-; GFX11-GISEL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-GISEL-NEXT: s_brev_b32 s0, s0
+; GFX11-GISEL-NEXT: s_lshr_b32 s0, s0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -199,25 +199,25 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
;
; FLAT-LABEL: s_brev_i32:
; FLAT: ; %bb.0:
-; FLAT-NEXT: s_load_dword s4, s[0:1], 0x2c
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dword s2, s[0:1], 0x2c
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b32 s4, s4
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; FLAT-NEXT: s_brev_b32 s0, s2
+; FLAT-NEXT: v_mov_b32_e32 v0, s0
+; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_i32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_brev_b32 s2, s2
-; GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_brev_b32 s0, s4
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s3
; GISEL-NEXT: flat_store_dword v[0:1], v2
; GISEL-NEXT: s_endpgm
;
@@ -225,14 +225,14 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
; GFX11-FLAT-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLAT-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FLAT-NEXT: s_mov_b32 s6, -1
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b32 s2, s2
+; GFX11-FLAT-NEXT: s_brev_b32 s0, s2
; GFX11-FLAT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
-; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FLAT-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-FLAT-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
@@ -240,14 +240,14 @@ define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #
; GFX11-GISEL-LABEL: s_brev_i32:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_brev_b32 s2, s2
+; GFX11-GISEL-NEXT: s_brev_b32 s0, s4
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -702,17 +702,17 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
; FLAT-LABEL: s_brev_v2i64:
; FLAT: ; %bb.0:
; FLAT-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s11, 0xf000
+; FLAT-NEXT: s_mov_b32 s10, -1
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: s_brev_b64 s[6:7], s[6:7]
-; FLAT-NEXT: s_brev_b64 s[4:5], s[4:5]
-; FLAT-NEXT: v_mov_b32_e32 v0, s4
-; FLAT-NEXT: v_mov_b32_e32 v1, s5
-; FLAT-NEXT: v_mov_b32_e32 v2, s6
-; FLAT-NEXT: v_mov_b32_e32 v3, s7
-; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; FLAT-NEXT: s_brev_b64 s[0:1], s[6:7]
+; FLAT-NEXT: s_brev_b64 s[2:3], s[4:5]
+; FLAT-NEXT: v_mov_b32_e32 v0, s2
+; FLAT-NEXT: v_mov_b32_e32 v1, s3
+; FLAT-NEXT: v_mov_b32_e32 v2, s0
+; FLAT-NEXT: v_mov_b32_e32 v3, s1
+; FLAT-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; FLAT-NEXT: s_endpgm
;
; GISEL-LABEL: s_brev_v2i64:
@@ -735,15 +735,15 @@ define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64>
; GFX11-FLAT: ; %bb.0:
; GFX11-FLAT-NEXT: s_clause 0x1
; GFX11-FLAT-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-FLAT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
+; GFX11-FLAT-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FLAT-NEXT: s_mov_b32 s10, -1
; GFX11-FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[4:5]
-; GFX11-FLAT-NEXT: s_brev_b64 s[4:5], s[6:7]
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
-; GFX11-FLAT-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-FLAT-NEXT: s_mov_b32 s2, -1
-; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-FLAT-NEXT: s_brev_b64 s[0:1], s[4:5]
+; GFX11-FLAT-NEXT: s_brev_b64 s[2:3], s[6:7]
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-FLAT-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-FLAT-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-FLAT-NEXT: s_nop 0
; GFX11-FLAT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLAT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
index 3dbbb877918ad..8bee436066159 100644
--- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -137,42 +137,42 @@ define amdgpu_kernel void @br_cc_f16_imm_a(
;
; VI-LABEL: br_cc_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_cbranch_vccnz .LBB1_2
; VI-NEXT: ; %bb.1: ; %one
; VI-NEXT: v_mov_b32_e32 v0, 0x3800
; VI-NEXT: .LBB1_2: ; %two
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccnz .LBB1_2
; GFX11-NEXT: ; %bb.1: ; %one
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB1_2: ; %two
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mov_b32 s6, s2
+; GFX11-NEXT: s_mov_b32 s7, s3
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -221,44 +221,44 @@ define amdgpu_kernel void @br_cc_f16_imm_b(
;
; VI-LABEL: br_cc_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_cbranch_vccnz .LBB2_2
; VI-NEXT: ; %bb.1: ; %one
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB2_2: ; %two
; VI-NEXT: v_mov_b32_e32 v0, 0x3800
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: br_cc_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: s_cbranch_vccz .LBB2_2
; GFX11-NEXT: ; %bb.1: ; %two
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX11-NEXT: .LBB2_2: ; %one
-; GFX11-NEXT: s_mov_b32 s2, s6
-; GFX11-NEXT: s_mov_b32 s3, s7
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mov_b32 s6, s2
+; GFX11-NEXT: s_mov_b32 s7, s3
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 384715a849c1e..b8d9878270327 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -15,7 +15,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $sgpr10_sgpr11 = COPY $sgpr8_sgpr9
; GFX90A-NEXT: renamable $vgpr31 = COPY $vgpr0, implicit $exec
; GFX90A-NEXT: renamable $sgpr33 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s32) from %ir.arg4.kernarg.offset.align.down, align 8, addrspace 4)
- ; GFX90A-NEXT: renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
+ ; GFX90A-NEXT: early-clobber renamable $sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX4_IMM_ec renamable $sgpr6_sgpr7, 24, 0 :: (dereferenceable invariant load (s128) from %ir.arg6.kernarg.offset.align.down, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr15 = S_LOAD_DWORD_IMM renamable $sgpr6_sgpr7, 40, 0 :: (dereferenceable invariant load (s32) from %ir.arg6.kernarg.offset.align.down + 16, align 8, addrspace 4)
; GFX90A-NEXT: renamable $sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; GFX90A-NEXT: renamable $sgpr54_sgpr55 = S_LOAD_DWORDX2_IMM renamable $sgpr6_sgpr7, 16, 0 :: (dereferenceable invariant load (s64) from %ir.arg.kernarg.offset1 + 16, align 16, addrspace 4)
diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll
index e4c7df385d861..134e76c5d2a70 100644
--- a/llvm/test/CodeGen/AMDGPU/bswap.ll
+++ b/llvm/test/CodeGen/AMDGPU/bswap.ll
@@ -34,29 +34,29 @@ define amdgpu_kernel void @test_bswap_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v0, 0, s2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v0, 0, s6, v0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v0, 0, s2, 0x10203
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -87,31 +87,31 @@ define amdgpu_kernel void @test_bswap_v2i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s3, v0
-; VI-NEXT: v_perm_b32 v0, 0, s2, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v1, 0, s7, v0
+; VI-NEXT: v_perm_b32 v0, 0, s6, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v1, 0, s1, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -148,35 +148,35 @@ define amdgpu_kernel void @test_bswap_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s11, v0
; VI-NEXT: v_perm_b32 v2, 0, s10, v0
; VI-NEXT: v_perm_b32 v1, 0, s9, v0
; VI-NEXT: v_perm_b32 v0, 0, s8, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v4i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s5, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s4, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v3, 0, s3, 0x10203
+; GFX11-NEXT: v_perm_b32 v2, 0, s2, 0x10203
+; GFX11-NEXT: v_perm_b32 v1, 0, s1, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s0, 0x10203
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -296,31 +296,31 @@ define amdgpu_kernel void @test_bswap_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_bswap_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_perm_b32 v1, 0, s2, v0
-; VI-NEXT: v_perm_b32 v0, 0, s3, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_perm_b32 v1, 0, s6, v0
+; VI-NEXT: v_perm_b32 v0, 0, s7, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,35 +357,35 @@ define amdgpu_kernel void @test_bswap_v2i64(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: test_bswap_v2i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s10, v0
; VI-NEXT: v_perm_b32 v2, 0, s11, v0
; VI-NEXT: v_perm_b32 v1, 0, s8, v0
; VI-NEXT: v_perm_b32 v0, 0, s9, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_bswap_v2i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_perm_b32 v3, 0, s6, 0x10203
-; GFX11-NEXT: v_perm_b32 v2, 0, s7, 0x10203
-; GFX11-NEXT: v_perm_b32 v1, 0, s4, 0x10203
-; GFX11-NEXT: v_perm_b32 v0, 0, s5, 0x10203
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_perm_b32 v3, 0, s2, 0x10203
+; GFX11-NEXT: v_perm_b32 v2, 0, s3, 0x10203
+; GFX11-NEXT: v_perm_b32 v1, 0, s0, 0x10203
+; GFX11-NEXT: v_perm_b32 v0, 0, s1, 0x10203
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll
index 8d347ae421437..04ee81bd8d7cc 100644
--- a/llvm/test/CodeGen/AMDGPU/build_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll
@@ -19,12 +19,12 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector2:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 6
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -52,12 +52,12 @@ define amdgpu_kernel void @build_vector2 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector2:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i32> <i32 5, i32 6>, ptr addrspace(1) %out
@@ -80,14 +80,14 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector4:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 5
; GFX8-NEXT: v_mov_b32_e32 v1, 6
; GFX8-NEXT: v_mov_b32_e32 v2, 7
; GFX8-NEXT: v_mov_b32_e32 v3, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -119,14 +119,14 @@ define amdgpu_kernel void @build_vector4 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector4:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v0, 5
; GFX940-NEXT: v_mov_b32_e32 v1, 6
; GFX940-NEXT: v_mov_b32_e32 v2, 7
; GFX940-NEXT: v_mov_b32_e32 v3, 8
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, ptr addrspace(1) %out
@@ -146,11 +146,11 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
;
; GFX8-LABEL: build_vector_v2i16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 0x60005
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -176,11 +176,11 @@ define amdgpu_kernel void @build_vector_v2i16 (ptr addrspace(1) %out) {
;
; GFX940-LABEL: build_vector_v2i16:
; GFX940: ; %bb.0: ; %entry
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v0, 0
; GFX940-NEXT: v_mov_b32_e32 v1, 0x60005
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_store_dword v0, v1, s[0:1] sc0 sc1
+; GFX940-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-NEXT: s_endpgm
entry:
store <2 x i16> <i16 5, i16 6>, ptr addrspace(1) %out
@@ -201,14 +201,14 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
;
; GFX8-LABEL: build_vector_v2i16_trunc:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s2, s2, 16
-; GFX8-NEXT: s_or_b32 s2, s2, 0x50000
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_lshr_b32 s0, s4, 16
+; GFX8-NEXT: s_or_b32 s0, s0, 0x50000
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 00af922b05f93..d5a9607c5e71f 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1357,29 +1357,29 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v8i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s2, s1, 24
-; VI-NEXT: s_lshr_b32 s3, s1, 16
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: s_add_i32 s2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_lshr_b32 s0, s3, 24
+; VI-NEXT: s_lshr_b32 s1, s3, 16
; VI-NEXT: s_add_i32 s1, s1, s1
-; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_add_i32 s3, s3, s3
+; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s0
+; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: s_lshr_b32 s4, s0, 24
-; VI-NEXT: s_lshr_b32 s5, s0, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_lshr_b32 s4, s2, 24
+; VI-NEXT: s_lshr_b32 s5, s2, 16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_add_i32 s2, s2, s2
; VI-NEXT: v_or_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v2, 8, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
@@ -1392,20 +1392,20 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v8i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1
-; GFX11-NEXT: s_lshr_b32 s2, s0, 16
-; GFX11-NEXT: s_lshr_b32 s3, s0, 24
-; GFX11-NEXT: s_lshr_b32 s4, s1, 16
-; GFX11-NEXT: s_lshr_b32 s5, s1, 24
-; GFX11-NEXT: v_add_nc_u16 v2, s1, s1
-; GFX11-NEXT: v_add_nc_u16 v3, s0, s0
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s2
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX11-NEXT: s_lshr_b32 s0, s2, 16
+; GFX11-NEXT: s_lshr_b32 s1, s2, 24
+; GFX11-NEXT: s_lshr_b32 s4, s3, 16
+; GFX11-NEXT: s_lshr_b32 s5, s3, 24
+; GFX11-NEXT: v_add_nc_u16 v2, s3, s3
+; GFX11-NEXT: v_add_nc_u16 v3, s2, s2
; GFX11-NEXT: v_add_nc_u16 v4, s5, s5
; GFX11-NEXT: v_add_nc_u16 v5, s4, s4
-; GFX11-NEXT: v_add_nc_u16 v6, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v7, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v6, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v7, s0, s0
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -1524,58 +1524,58 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v16i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s3, 24
-; VI-NEXT: s_lshr_b32 s5, s3, 16
-; VI-NEXT: s_add_i32 s5, s5, s5
-; VI-NEXT: s_add_i32 s4, s4, s4
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s4
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: s_lshr_b32 s6, s2, 24
-; VI-NEXT: s_lshr_b32 s7, s2, 16
+; VI-NEXT: s_lshr_b32 s0, s7, 24
+; VI-NEXT: s_lshr_b32 s1, s7, 16
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_lshr_b32 s2, s6, 24
+; VI-NEXT: s_lshr_b32 s3, s6, 16
; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: s_add_i32 s7, s7, s7
-; VI-NEXT: s_add_i32 s6, s6, s6
-; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v5, s7
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: v_or_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_add_i32 s6, s6, s6
; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s6
-; VI-NEXT: v_mov_b32_e32 v5, s7
-; VI-NEXT: s_lshr_b32 s8, s1, 24
-; VI-NEXT: s_lshr_b32 s9, s1, 16
+; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshr_b32 s8, s5, 24
+; VI-NEXT: s_lshr_b32 s9, s5, 16
; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s2
+; VI-NEXT: v_mov_b32_e32 v5, s6
; VI-NEXT: s_add_i32 s9, s9, s9
; VI-NEXT: s_add_i32 s8, s8, s8
; VI-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: v_or_b32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s8
; VI-NEXT: v_mov_b32_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s10, s0, 24
-; VI-NEXT: s_lshr_b32 s11, s0, 16
+; VI-NEXT: s_lshr_b32 s10, s4, 24
+; VI-NEXT: s_lshr_b32 s11, s4, 16
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_add_i32 s11, s11, s11
; VI-NEXT: s_add_i32 s10, s10, s10
; VI-NEXT: v_or_b32_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v4, 8, s10
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v6, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
@@ -1585,36 +1585,36 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v16i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s10, s3, 16
-; GFX11-NEXT: s_lshr_b32 s11, s3, 24
-; GFX11-NEXT: v_lshrrev_b16 v2, 8, s2
-; GFX11-NEXT: v_lshrrev_b16 v3, 8, s3
+; GFX11-NEXT: s_lshr_b32 s10, s7, 16
+; GFX11-NEXT: s_lshr_b32 s11, s7, 24
+; GFX11-NEXT: v_lshrrev_b16 v2, 8, s6
+; GFX11-NEXT: v_lshrrev_b16 v3, 8, s7
; GFX11-NEXT: v_add_nc_u16 v7, s11, s11
; GFX11-NEXT: v_add_nc_u16 v8, s10, s10
-; GFX11-NEXT: v_add_nc_u16 v4, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v5, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v4, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v5, s6, s6
; GFX11-NEXT: v_add_nc_u16 v3, v3, v3
; GFX11-NEXT: v_add_nc_u16 v2, v2, v2
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT: s_lshr_b32 s7, s1, 24
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s1
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s0
-; GFX11-NEXT: v_add_nc_u16 v11, s7, s7
+; GFX11-NEXT: s_lshr_b32 s3, s5, 24
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s5
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s4
+; GFX11-NEXT: v_add_nc_u16 v11, s3, s3
; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-NEXT: v_and_b32_e32 v5, 0xff, v5
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
; GFX11-NEXT: v_lshlrev_b16 v2, 8, v2
; GFX11-NEXT: v_or_b32_e32 v7, v8, v7
-; GFX11-NEXT: s_lshr_b32 s6, s1, 16
-; GFX11-NEXT: s_lshr_b32 s4, s0, 16
-; GFX11-NEXT: s_lshr_b32 s5, s0, 24
-; GFX11-NEXT: s_lshr_b32 s8, s2, 16
-; GFX11-NEXT: s_lshr_b32 s9, s2, 24
-; GFX11-NEXT: v_add_nc_u16 v6, s1, s1
-; GFX11-NEXT: v_add_nc_u16 v12, s6, s6
+; GFX11-NEXT: s_lshr_b32 s2, s5, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
+; GFX11-NEXT: s_lshr_b32 s1, s4, 24
+; GFX11-NEXT: s_lshr_b32 s8, s6, 16
+; GFX11-NEXT: s_lshr_b32 s9, s6, 24
+; GFX11-NEXT: v_add_nc_u16 v6, s5, s5
+; GFX11-NEXT: v_add_nc_u16 v12, s2, s2
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
; GFX11-NEXT: v_add_nc_u16 v9, s9, s9
; GFX11-NEXT: v_add_nc_u16 v10, s8, s8
@@ -1622,10 +1622,10 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
; GFX11-NEXT: v_or_b32_e32 v2, v5, v2
; GFX11-NEXT: v_lshlrev_b16 v4, 8, v11
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v7
-; GFX11-NEXT: v_add_nc_u16 v7, s0, s0
+; GFX11-NEXT: v_add_nc_u16 v7, s4, s4
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT: v_add_nc_u16 v8, s5, s5
-; GFX11-NEXT: v_add_nc_u16 v11, s4, s4
+; GFX11-NEXT: v_add_nc_u16 v8, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v11, s0, s0
; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1
; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
@@ -1816,112 +1816,112 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
;
; VI-LABEL: amd_kernel_v32i8:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v10, 0
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s8, s3, 24
-; VI-NEXT: s_lshr_b32 s9, s3, 16
-; VI-NEXT: s_add_i32 s9, s9, s9
-; VI-NEXT: s_add_i32 s8, s8, s8
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_add_i32 s3, s3, s3
-; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s8
-; VI-NEXT: v_mov_b32_e32 v9, s9
-; VI-NEXT: s_lshr_b32 s10, s2, 24
-; VI-NEXT: s_lshr_b32 s11, s2, 16
+; VI-NEXT: s_lshr_b32 s0, s7, 24
+; VI-NEXT: s_lshr_b32 s1, s7, 16
+; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s0
+; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: s_lshr_b32 s2, s6, 24
+; VI-NEXT: s_lshr_b32 s3, s6, 16
; VI-NEXT: v_add_u32_sdwa v3, vcc, v3, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s3
-; VI-NEXT: s_add_i32 s11, s11, s11
-; VI-NEXT: s_add_i32 s10, s10, s10
-; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: s_add_i32 s3, s3, s3
; VI-NEXT: s_add_i32 s2, s2, s2
+; VI-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: s_add_i32 s6, s6, s6
; VI-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s10
-; VI-NEXT: v_mov_b32_e32 v9, s11
-; VI-NEXT: s_lshr_b32 s12, s1, 24
-; VI-NEXT: s_lshr_b32 s13, s1, 16
+; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s2
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: s_lshr_b32 s12, s5, 24
+; VI-NEXT: s_lshr_b32 s13, s5, 16
; VI-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s2
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v9, s6
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: s_add_i32 s13, s13, s13
; VI-NEXT: s_add_i32 s12, s12, s12
; VI-NEXT: v_or_b32_sdwa v2, v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_sdwa v4, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_i32 s1, s1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_add_i32 s5, s5, s5
; VI-NEXT: v_or_b32_sdwa v2, v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s12
; VI-NEXT: v_mov_b32_e32 v9, s13
-; VI-NEXT: s_lshr_b32 s14, s0, 24
-; VI-NEXT: s_lshr_b32 s15, s0, 16
+; VI-NEXT: s_lshr_b32 s14, s4, 24
+; VI-NEXT: s_lshr_b32 s15, s4, 16
; VI-NEXT: v_add_u32_sdwa v5, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: v_add_u32_sdwa v1, vcc, v1, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: v_mov_b32_e32 v9, s5
; VI-NEXT: v_add_u32_sdwa v6, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: s_add_i32 s15, s15, s15
; VI-NEXT: s_add_i32 s14, s14, s14
; VI-NEXT: v_or_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_add_u32_sdwa v7, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s0, s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_add_i32 s4, s4, s4
; VI-NEXT: v_or_b32_sdwa v1, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s14
; VI-NEXT: v_mov_b32_e32 v9, s15
-; VI-NEXT: s_lshr_b32 s16, s7, 24
-; VI-NEXT: s_lshr_b32 s17, s7, 16
+; VI-NEXT: s_lshr_b32 s16, s11, 24
+; VI-NEXT: s_lshr_b32 s17, s11, 16
; VI-NEXT: v_add_u32_sdwa v0, vcc, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s0
+; VI-NEXT: v_mov_b32_e32 v9, s4
; VI-NEXT: s_add_i32 s17, s17, s17
; VI-NEXT: s_add_i32 s16, s16, s16
; VI-NEXT: v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s7, s7, s7
+; VI-NEXT: s_add_i32 s11, s11, s11
; VI-NEXT: v_or_b32_sdwa v0, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s16
; VI-NEXT: v_mov_b32_e32 v9, s17
-; VI-NEXT: s_lshr_b32 s18, s6, 24
-; VI-NEXT: s_lshr_b32 s19, s6, 16
+; VI-NEXT: s_lshr_b32 s18, s10, 24
+; VI-NEXT: s_lshr_b32 s19, s10, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s7
+; VI-NEXT: v_mov_b32_e32 v9, s11
; VI-NEXT: s_add_i32 s19, s19, s19
; VI-NEXT: s_add_i32 s18, s18, s18
; VI-NEXT: v_or_b32_sdwa v7, v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s6, s6, s6
+; VI-NEXT: s_add_i32 s10, s10, s10
; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s18
; VI-NEXT: v_mov_b32_e32 v9, s19
-; VI-NEXT: s_lshr_b32 s20, s5, 24
-; VI-NEXT: s_lshr_b32 s21, s5, 16
+; VI-NEXT: s_lshr_b32 s20, s9, 24
+; VI-NEXT: s_lshr_b32 s21, s9, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s6
+; VI-NEXT: v_mov_b32_e32 v9, s10
; VI-NEXT: s_add_i32 s21, s21, s21
; VI-NEXT: s_add_i32 s20, s20, s20
; VI-NEXT: v_or_b32_sdwa v6, v9, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s5, s5, s5
+; VI-NEXT: s_add_i32 s9, s9, s9
; VI-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s20
; VI-NEXT: v_mov_b32_e32 v9, s21
-; VI-NEXT: s_lshr_b32 s22, s4, 24
-; VI-NEXT: s_lshr_b32 s23, s4, 16
+; VI-NEXT: s_lshr_b32 s22, s8, 24
+; VI-NEXT: s_lshr_b32 s23, s8, 16
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s5
+; VI-NEXT: v_mov_b32_e32 v9, s9
; VI-NEXT: s_add_i32 s23, s23, s23
; VI-NEXT: s_add_i32 s22, s22, s22
; VI-NEXT: v_or_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: s_add_i32 s4, s4, s4
+; VI-NEXT: s_add_i32 s8, s8, s8
; VI-NEXT: v_or_b32_sdwa v5, v5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e64 v8, 8, s22
; VI-NEXT: v_mov_b32_e32 v9, s23
; VI-NEXT: v_or_b32_sdwa v8, v9, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v9, s4
+; VI-NEXT: v_mov_b32_e32 v9, s8
; VI-NEXT: v_or_b32_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v4, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v8, 16
@@ -1932,39 +1932,39 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
;
; GFX11-LABEL: amd_kernel_v32i8:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b16 v3, 8, s2
-; GFX11-NEXT: v_lshrrev_b16 v7, 8, s3
-; GFX11-NEXT: s_lshr_b32 s21, s3, 16
-; GFX11-NEXT: s_lshr_b32 s22, s3, 24
-; GFX11-NEXT: v_add_nc_u16 v8, s3, s3
-; GFX11-NEXT: v_add_nc_u16 v9, s2, s2
+; GFX11-NEXT: v_lshrrev_b16 v3, 8, s6
+; GFX11-NEXT: v_lshrrev_b16 v7, 8, s7
+; GFX11-NEXT: s_lshr_b32 s21, s7, 16
+; GFX11-NEXT: s_lshr_b32 s22, s7, 24
+; GFX11-NEXT: v_add_nc_u16 v8, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v9, s6, s6
; GFX11-NEXT: v_add_nc_u16 v7, v7, v7
; GFX11-NEXT: v_add_nc_u16 v10, s22, s22
; GFX11-NEXT: v_add_nc_u16 v11, s21, s21
; GFX11-NEXT: v_add_nc_u16 v3, v3, v3
-; GFX11-NEXT: v_lshrrev_b16 v2, 8, s1
+; GFX11-NEXT: v_lshrrev_b16 v2, 8, s5
; GFX11-NEXT: v_and_b32_e32 v8, 0xff, v8
; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v10, 8, v10
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
-; GFX11-NEXT: s_lshr_b32 s18, s1, 16
-; GFX11-NEXT: s_lshr_b32 s19, s1, 24
-; GFX11-NEXT: s_lshr_b32 s20, s2, 24
-; GFX11-NEXT: s_lshr_b32 s2, s2, 16
+; GFX11-NEXT: s_lshr_b32 s18, s5, 16
+; GFX11-NEXT: s_lshr_b32 s19, s5, 24
+; GFX11-NEXT: s_lshr_b32 s20, s6, 24
+; GFX11-NEXT: s_lshr_b32 s6, s6, 16
; GFX11-NEXT: v_or_b32_e32 v7, v8, v7
; GFX11-NEXT: v_add_nc_u16 v8, s20, s20
; GFX11-NEXT: v_or_b32_e32 v10, v11, v10
; GFX11-NEXT: v_or_b32_e32 v3, v9, v3
-; GFX11-NEXT: v_add_nc_u16 v9, s2, s2
-; GFX11-NEXT: v_add_nc_u16 v11, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v9, s6, s6
+; GFX11-NEXT: v_add_nc_u16 v11, s5, s5
; GFX11-NEXT: v_add_nc_u16 v2, v2, v2
; GFX11-NEXT: v_add_nc_u16 v12, s19, s19
; GFX11-NEXT: v_add_nc_u16 v13, s18, s18
-; GFX11-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX11-NEXT: v_lshrrev_b16 v1, 8, s4
; GFX11-NEXT: v_lshlrev_b16 v8, 8, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
@@ -1974,10 +1974,10 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff, v3
; GFX11-NEXT: v_or_b32_e32 v3, v9, v8
; GFX11-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-NEXT: v_add_nc_u16 v9, s0, s0
+; GFX11-NEXT: v_add_nc_u16 v9, s4, s4
; GFX11-NEXT: v_or_b32_e32 v8, v13, v12
; GFX11-NEXT: v_add_nc_u16 v1, v1, v1
-; GFX11-NEXT: v_lshrrev_b16 v6, 8, s7
+; GFX11-NEXT: v_lshrrev_b16 v6, 8, s11
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v3
@@ -1985,14 +1985,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-NEXT: v_and_b32_e32 v9, 0xff, v9
; GFX11-NEXT: v_lshlrev_b16 v13, 8, v1
-; GFX11-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX11-NEXT: s_lshr_b32 s14, s7, 16
-; GFX11-NEXT: s_lshr_b32 s15, s7, 24
-; GFX11-NEXT: s_lshr_b32 s16, s0, 16
-; GFX11-NEXT: s_lshr_b32 s17, s0, 24
+; GFX11-NEXT: v_lshrrev_b16 v5, 8, s10
+; GFX11-NEXT: s_lshr_b32 s14, s11, 16
+; GFX11-NEXT: s_lshr_b32 s15, s11, 24
+; GFX11-NEXT: s_lshr_b32 s16, s4, 16
+; GFX11-NEXT: s_lshr_b32 s17, s4, 24
; GFX11-NEXT: v_or_b32_e32 v3, v7, v10
; GFX11-NEXT: v_or_b32_e32 v2, v14, v11
-; GFX11-NEXT: v_add_nc_u16 v7, s7, s7
+; GFX11-NEXT: v_add_nc_u16 v7, s11, s11
; GFX11-NEXT: v_or_b32_e32 v1, v12, v8
; GFX11-NEXT: v_or_b32_e32 v8, v9, v13
; GFX11-NEXT: v_add_nc_u16 v9, s17, s17
@@ -2000,7 +2000,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_add_nc_u16 v6, v6, v6
; GFX11-NEXT: v_add_nc_u16 v11, s15, s15
; GFX11-NEXT: v_add_nc_u16 v12, s14, s14
-; GFX11-NEXT: v_add_nc_u16 v13, s6, s6
+; GFX11-NEXT: v_add_nc_u16 v13, s10, s10
; GFX11-NEXT: v_add_nc_u16 v5, v5, v5
; GFX11-NEXT: v_and_b32_e32 v7, 0xff, v7
; GFX11-NEXT: v_lshlrev_b16 v6, 8, v6
@@ -2008,16 +2008,16 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_and_b32_e32 v12, 0xff, v12
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
-; GFX11-NEXT: v_lshrrev_b16 v0, 8, s4
-; GFX11-NEXT: v_lshrrev_b16 v4, 8, s5
+; GFX11-NEXT: v_lshrrev_b16 v0, 8, s8
+; GFX11-NEXT: v_lshrrev_b16 v4, 8, s9
; GFX11-NEXT: v_and_b32_e32 v13, 0xff, v13
; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5
-; GFX11-NEXT: s_lshr_b32 s12, s6, 16
-; GFX11-NEXT: s_lshr_b32 s13, s6, 24
-; GFX11-NEXT: s_lshr_b32 s8, s4, 16
-; GFX11-NEXT: s_lshr_b32 s9, s4, 24
-; GFX11-NEXT: s_lshr_b32 s10, s5, 16
-; GFX11-NEXT: s_lshr_b32 s11, s5, 24
+; GFX11-NEXT: s_lshr_b32 s12, s10, 16
+; GFX11-NEXT: s_lshr_b32 s13, s10, 24
+; GFX11-NEXT: s_lshr_b32 s0, s8, 16
+; GFX11-NEXT: s_lshr_b32 s1, s8, 24
+; GFX11-NEXT: s_lshr_b32 s2, s9, 16
+; GFX11-NEXT: s_lshr_b32 s3, s9, 24
; GFX11-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-NEXT: v_or_b32_e32 v7, v12, v11
; GFX11-NEXT: v_and_b32_e32 v12, 0xffff, v8
@@ -2025,14 +2025,14 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
; GFX11-NEXT: v_add_nc_u16 v9, s13, s13
; GFX11-NEXT: v_add_nc_u16 v10, s12, s12
; GFX11-NEXT: v_or_b32_e32 v5, v13, v5
-; GFX11-NEXT: v_add_nc_u16 v11, s5, s5
+; GFX11-NEXT: v_add_nc_u16 v11, s9, s9
; GFX11-NEXT: v_add_nc_u16 v4, v4, v4
-; GFX11-NEXT: v_add_nc_u16 v13, s11, s11
-; GFX11-NEXT: v_add_nc_u16 v14, s10, s10
-; GFX11-NEXT: v_add_nc_u16 v15, s4, s4
+; GFX11-NEXT: v_add_nc_u16 v13, s3, s3
+; GFX11-NEXT: v_add_nc_u16 v14, s2, s2
+; GFX11-NEXT: v_add_nc_u16 v15, s8, s8
; GFX11-NEXT: v_add_nc_u16 v0, v0, v0
-; GFX11-NEXT: v_add_nc_u16 v16, s9, s9
-; GFX11-NEXT: v_add_nc_u16 v17, s8, s8
+; GFX11-NEXT: v_add_nc_u16 v16, s1, s1
+; GFX11-NEXT: v_add_nc_u16 v17, s0, s0
; GFX11-NEXT: v_lshlrev_b16 v9, 8, v9
; GFX11-NEXT: v_and_b32_e32 v10, 0xff, v10
; GFX11-NEXT: v_and_b32_e32 v11, 0xff, v11
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index a0499ef6d0f6a..8ad4535de9462 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -35,11 +35,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; VI-LABEL: sadd64rr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_add_u32 s0, s6, s0
-; VI-NEXT: s_addc_u32 s1, s7, s1
+; VI-NEXT: s_add_u32 s0, s6, s2
+; VI-NEXT: s_addc_u32 s1, s7, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -77,11 +77,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W32: ; %bb.0: ; %entry
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s0, s6, s0
-; GFX1030W32-NEXT: s_addc_u32 s1, s7, s1
+; GFX1030W32-NEXT: s_add_u32 s0, s6, s2
+; GFX1030W32-NEXT: s_addc_u32 s1, s7, s3
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -91,11 +91,11 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W64: ; %bb.0: ; %entry
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s0, s6, s0
-; GFX1030W64-NEXT: s_addc_u32 s1, s7, s1
+; GFX1030W64-NEXT: s_add_u32 s0, s6, s2
+; GFX1030W64-NEXT: s_addc_u32 s1, s7, s3
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -105,10 +105,10 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s6, s0
-; GFX11-NEXT: s_addc_u32 s1, s7, s1
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -144,74 +144,74 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: sadd64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s2, 0x56789876
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s3, 0x1234
+; VI-NEXT: s_add_u32 s0, s6, 0x56789876
+; VI-NEXT: s_addc_u32 s1, s7, 0x1234
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: sadd64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX9-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX9-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: sadd64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1010-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1010-NEXT: v_mov_b32_e32 v0, s2
-; GFX1010-NEXT: v_mov_b32_e32 v1, s3
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1010-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: sadd64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1030W32-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1030W32-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: sadd64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX1030W64-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX1030W64-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: sadd64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s2, 0x56789876
-; GFX11-NEXT: s_addc_u32 s3, s3, 0x1234
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_add_u32 s0, s6, 0x56789876
+; GFX11-NEXT: s_addc_u32 s1, s7, 0x1234
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -243,66 +243,66 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: vadd64rr:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_add_u32_e32 v3, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vadd64rr:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64rr:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: v_add_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64rr:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v0, s2, s2, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, s6, v0
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vadd64rr:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[4:5], s2, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], s6, v0
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vadd64rr:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -334,65 +334,66 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
;
; VI-LABEL: vadd64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_add_u32_e32 v0, vcc, 0x56789876, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x1234
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vadd64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vadd64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1010-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1010-NEXT: s_mov_b32 null, 0
+; GFX1010-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s2, 0, 0x1234, s2
+; GFX1010-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, 0x1234, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vadd64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W32-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vadd64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[2:3], 0x56789876, v0
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[2:3]
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s[0:1]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vadd64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_add_co_u32 v0, s2, 0x56789876, v0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_add_co_u32 v0, s0, 0x56789876, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s2
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, 0x1234, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -424,12 +425,12 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: suaddo32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_add_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -460,36 +461,36 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_i32 s2, s2, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W32-NEXT: s_add_i32 s0, s2, s3
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: suaddo32:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_i32 s2, s2, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W64-NEXT: s_add_i32 s0, s2, s3
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: suaddo32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_i32 s2, s2, s3
+; GFX11-NEXT: s_add_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,12 +535,12 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: uaddo32_vcc_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -576,42 +577,42 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1030W32-LABEL: uaddo32_vcc_user:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT: v_add_co_u32 v1, s0, s2, s3
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W32-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: uaddo32_vcc_user:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT: v_add_co_u32 v1, s[0:1], s2, s3
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W64-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: uaddo32_vcc_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v1, s4, s4, s5
+; GFX11-NEXT: v_add_co_u32 v1, s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -655,19 +656,19 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; VI-LABEL: suaddo64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -675,80 +676,80 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: suaddo64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: suaddo64:
; GFX1010: ; %bb.0:
-; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_add_u32 s6, s4, s6
-; GFX1010-NEXT: s_addc_u32 s7, s5, s7
-; GFX1010-NEXT: v_mov_b32_e32 v0, s6
-; GFX1010-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT: v_mov_b32_e32 v1, s7
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT: s_add_u32 s0, s8, s10
+; GFX1010-NEXT: s_addc_u32 s1, s9, s11
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: suaddo64:
; GFX1030W32: ; %bb.0:
-; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: s_add_u32 s0, s8, s10
+; GFX1030W32-NEXT: s_addc_u32 s1, s9, s11
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: suaddo64:
; GFX1030W64: ; %bb.0:
-; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_add_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_addc_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: s_add_u32 s0, s8, s10
+; GFX1030W64-NEXT: s_addc_u32 s1, s9, s11
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], s[8:9]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: suaddo64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s6, s4, s6
-; GFX11-NEXT: s_addc_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -792,13 +793,13 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: vuaddo64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v6, s1
-; VI-NEXT: v_add_u32_e32 v5, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[5:6]
+; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
@@ -840,48 +841,48 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-LABEL: vuaddo64:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_add_co_u32 v0, s6, s4, v0
-; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1030W32-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030W32-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX1030W32-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vuaddo64:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_add_co_u32 v0, s[6:7], s4, v0
-; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
-; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1030W64-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
+; GFX1030W64-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX1030W64-NEXT: v_cmp_gt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vuaddo64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s6, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -920,11 +921,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; VI-LABEL: ssub64rr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_sub_u32 s0, s6, s0
-; VI-NEXT: s_subb_u32 s1, s7, s1
+; VI-NEXT: s_sub_u32 s0, s6, s2
+; VI-NEXT: s_subb_u32 s1, s7, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -962,11 +963,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W32: ; %bb.0: ; %entry
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s0, s6, s0
-; GFX1030W32-NEXT: s_subb_u32 s1, s7, s1
+; GFX1030W32-NEXT: s_sub_u32 s0, s6, s2
+; GFX1030W32-NEXT: s_subb_u32 s1, s7, s3
; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -976,11 +977,11 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX1030W64: ; %bb.0: ; %entry
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s0, s6, s0
-; GFX1030W64-NEXT: s_subb_u32 s1, s7, s1
+; GFX1030W64-NEXT: s_sub_u32 s0, s6, s2
+; GFX1030W64-NEXT: s_subb_u32 s1, s7, s3
; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -990,10 +991,10 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s0, s6, s0
-; GFX11-NEXT: s_subb_u32 s1, s7, s1
+; GFX11-NEXT: s_sub_u32 s0, s6, s2
+; GFX11-NEXT: s_subb_u32 s1, s7, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -1029,74 +1030,74 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: ssub64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, 0x56789876, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, 0x1234, s3
+; VI-NEXT: s_sub_u32 s0, 0x56789876, s6
+; VI-NEXT: s_subb_u32 s1, 0x1234, s7
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: ssub64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX9-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX9-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: ssub64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1010-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1010-NEXT: v_mov_b32_e32 v0, s2
-; GFX1010-NEXT: v_mov_b32_e32 v1, s3
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1010-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: ssub64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1030W32-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1030W32-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: ssub64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX1030W64-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s2
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s3
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX1030W64-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: ssub64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s2, 0x56789876, s2
-; GFX11-NEXT: s_subb_u32 s3, 0x1234, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sub_u32 s0, 0x56789876, s6
+; GFX11-NEXT: s_subb_u32 s1, 0x1234, s7
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1128,66 +1129,66 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: vsub64rr:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_sub_u32_e32 v3, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v4, s7
+; VI-NEXT: v_sub_u32_e32 v3, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s4
+; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_store_dwordx2 v[1:2], v[3:4]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vsub64rr:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s6, v0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64rr:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: v_sub_co_u32 v0, s2, s2, v0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, s3, 0, s2
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: v_sub_co_u32 v0, s0, s6, v0
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, s7, 0, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64rr:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, s2, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, s6, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vsub64rr:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[4:5], s2, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s6, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vsub64rr:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v0, s2, s2, v0
+; GFX11-NEXT: v_sub_co_u32 v0, s0, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s7, 0, s0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1219,65 +1220,66 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
;
; VI-LABEL: vsub64ri:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_sub_u32_e32 v0, vcc, 0x56789876, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x1234
; VI-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: vsub64ri:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, 0x56789876, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x1234
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: vsub64ri:
; GFX1010: ; %bb.0: ; %entry
-; GFX1010-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1010-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX1010-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1010-NEXT: s_mov_b32 null, 0
+; GFX1010-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
-; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s2, 0x1234, 0, s2
+; GFX1010-NEXT: v_sub_co_ci_u32_e64 v1, s0, 0x1234, 0, s0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: vsub64ri:
; GFX1030W32: ; %bb.0: ; %entry
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vsub64ri:
; GFX1030W64: ; %bb.0: ; %entry
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[2:3], 0x56789876, v0
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], 0x56789876, v0
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[2:3]
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s[0:1]
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vsub64ri:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: v_sub_co_u32 v0, s2, 0x56789876, v0
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_sub_co_u32 v0, s0, 0x56789876, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s2
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, 0x1234, 0, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1310,12 +1312,12 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: susubo32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sub_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sub_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1346,36 +1348,36 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_i32 s2, s2, s3
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W32-NEXT: s_sub_i32 s0, s2, s3
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: susubo32:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_i32 s2, s2, s3
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s2
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1030W64-NEXT: s_sub_i32 s0, s2, s3
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s0
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: susubo32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, s2, s3
+; GFX11-NEXT: s_sub_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1420,12 +1422,12 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: usubo32_vcc_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -1462,42 +1464,42 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
; GFX1030W32-LABEL: usubo32_vcc_user:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v1, s4, s4, s5
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W32-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W32-NEXT: v_sub_co_u32 v1, s0, s2, s3
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W32-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: usubo32_vcc_user:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v0, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v1, s[4:5], s4, s5
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX1030W64-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX1030W64-NEXT: v_sub_co_u32 v1, s[0:1], s2, s3
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX1030W64-NEXT: global_store_byte v0, v2, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: usubo32_vcc_user:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v1, s4, s4, s5
+; GFX11-NEXT: v_sub_co_u32 v1, s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1541,19 +1543,19 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; VI-LABEL: susubo64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_sub_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_subb_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -1561,80 +1563,80 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
;
; GFX9-LABEL: susubo64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_sub_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_subb_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX1010-LABEL: susubo64:
; GFX1010: ; %bb.0:
-; GFX1010-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1010-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1010-NEXT: v_mov_b32_e32 v2, 0
; GFX1010-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1010-NEXT: s_sub_u32 s6, s4, s6
-; GFX1010-NEXT: s_subb_u32 s7, s5, s7
-; GFX1010-NEXT: v_mov_b32_e32 v0, s6
-; GFX1010-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1010-NEXT: v_mov_b32_e32 v1, s7
-; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1010-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1010-NEXT: s_sub_u32 s0, s8, s10
+; GFX1010-NEXT: s_subb_u32 s1, s9, s11
+; GFX1010-NEXT: v_mov_b32_e32 v0, s0
+; GFX1010-NEXT: v_mov_b32_e32 v1, s1
+; GFX1010-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1010-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1010-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1010-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1010-NEXT: s_endpgm
;
; GFX1030W32-LABEL: susubo64:
; GFX1030W32: ; %bb.0:
-; GFX1030W32-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W32-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W32-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX1030W32-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: s_sub_u32 s0, s8, s10
+; GFX1030W32-NEXT: s_subb_u32 s1, s9, s11
+; GFX1030W32-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W32-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W32-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: susubo64:
; GFX1030W64: ; %bb.0:
-; GFX1030W64-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: s_sub_u32 s6, s4, s6
-; GFX1030W64-NEXT: s_subb_u32 s7, s5, s7
-; GFX1030W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], s[4:5]
-; GFX1030W64-NEXT: v_mov_b32_e32 v1, s7
-; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: s_sub_u32 s0, s8, s10
+; GFX1030W64-NEXT: s_subb_u32 s1, s9, s11
+; GFX1030W64-NEXT: v_mov_b32_e32 v0, s0
+; GFX1030W64-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030W64-NEXT: v_cmp_gt_u64_e64 s[0:1], s[0:1], s[8:9]
+; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: susubo64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_u32 s6, s4, s6
-; GFX11-NEXT: s_subb_u32 s7, s5, s7
-; GFX11-NEXT: v_mov_b32_e32 v0, s6
-; GFX11-NEXT: v_cmp_gt_u64_e64 s4, s[6:7], s[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_sub_u32 s0, s8, s10
+; GFX11-NEXT: s_subb_u32 s1, s9, s11
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_cmp_gt_u64_e64 s0, s[0:1], s[8:9]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1678,13 +1680,13 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; VI-LABEL: vusubo64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v6, s1
-; VI-NEXT: v_sub_u32_e32 v5, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: v_sub_u32_e32 v5, vcc, s2, v0
; VI-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[5:6]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[5:6]
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
@@ -1726,48 +1728,48 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; GFX1030W32-LABEL: vusubo64:
; GFX1030W32: ; %bb.0:
; GFX1030W32-NEXT: s_clause 0x1
-; GFX1030W32-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W32-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W32-NEXT: v_sub_co_u32 v0, s6, s4, v0
-; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX1030W32-NEXT: v_sub_co_u32 v0, s0, s2, v0
+; GFX1030W32-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX1030W32-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX1030W32-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
-; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W32-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W32-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W32-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W32-NEXT: s_endpgm
;
; GFX1030W64-LABEL: vusubo64:
; GFX1030W64: ; %bb.0:
; GFX1030W64-NEXT: s_clause 0x1
-; GFX1030W64-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030W64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030W64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030W64-NEXT: v_mov_b32_e32 v2, 0
; GFX1030W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030W64-NEXT: v_sub_co_u32 v0, s[6:7], s4, v0
-; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s[6:7]
-; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX1030W64-NEXT: v_sub_co_u32 v0, s[0:1], s2, v0
+; GFX1030W64-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s[0:1]
+; GFX1030W64-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1]
; GFX1030W64-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX1030W64-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX1030W64-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX1030W64-NEXT: global_store_byte v2, v3, s[6:7]
; GFX1030W64-NEXT: s_endpgm
;
; GFX11-LABEL: vusubo64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_co_u32 v0, s6, s4, v0
+; GFX11-NEXT: v_sub_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s5, 0, s6
-; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1]
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
index 84bd9b6f6c5d4..5c9762bc2fe63 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -23,15 +23,15 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_add_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -40,24 +40,24 @@ define amdgpu_kernel void @v_clamp_add_src_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -95,15 +95,15 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_multi_use_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -115,29 +115,29 @@ define amdgpu_kernel void @v_clamp_multi_use_src_f32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_multi_use_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e64 v2, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v2, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -174,15 +174,15 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_dbg_use_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -191,24 +191,24 @@ define amdgpu_kernel void @v_clamp_dbg_use_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_dbg_use_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_dbg_use_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,15 +244,15 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_add_neg_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_floor_f32_e32 v2, v3
@@ -262,27 +262,27 @@ define amdgpu_kernel void @v_clamp_add_neg_src_f32(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_add_neg_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_floor_f32_e32 v1, v1
; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_neg_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_floor_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -318,15 +318,15 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_non_clamp_max_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -336,27 +336,27 @@ define amdgpu_kernel void @v_non_clamp_max_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_non_clamp_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_non_clamp_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -389,15 +389,15 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_add_src_f32_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 1.0 clamp
@@ -406,24 +406,24 @@ define amdgpu_kernel void @v_clamp_add_src_f32_denormals(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_add_src_f32_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f32_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -459,15 +459,15 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_add_src_f16_denorm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -476,24 +476,24 @@ define amdgpu_kernel void @v_clamp_add_src_f16_denorm(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_add_src_f16_denorm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_denorm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -529,15 +529,15 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -546,24 +546,24 @@ define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f16_no_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -598,15 +598,15 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_add_src_v2f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
@@ -616,26 +616,26 @@ define amdgpu_kernel void @v_clamp_add_src_v2f32(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_add_src_v2f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
; GFX9-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v0, v0, 1.0 clamp
; GFX11-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -669,15 +669,15 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_add_src_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
@@ -686,24 +686,24 @@ define amdgpu_kernel void @v_clamp_add_src_f64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_add_src_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0 clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -826,16 +826,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -846,24 +846,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -905,16 +905,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -925,24 +925,24 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_no_denormals(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_no_denormals:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -992,16 +992,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -1014,27 +1014,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1078,16 +1078,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -1099,27 +1099,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_lo(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_lo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1165,16 +1165,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v3, 1.0 clamp
@@ -1186,27 +1186,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_neg_hi(ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_neg_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1251,16 +1251,16 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
;
; GFX8-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1271,27 +1271,27 @@ define amdgpu_kernel void @v_clamp_add_src_v2f16_denorm_shuf(ptr addrspace(1) %o
;
; GFX9-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_add_src_v2f16_denorm_shuf:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1334,15 +1334,15 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -1354,27 +1354,27 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f32_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f32_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1419,16 +1419,16 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_no_clamp_add_packed_src_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1440,27 +1440,27 @@ define amdgpu_kernel void @v_no_clamp_add_packed_src_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_no_clamp_add_packed_src_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_packed_src_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1505,16 +1505,16 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
;
; GFX8-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_ushort v2, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f16_e64 v2, v2, 1.0 clamp
@@ -1523,30 +1523,30 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_no_clamp_add_src_v2f16_f16_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 947284506a297..57e855f1ccc34 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -24,15 +24,15 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -41,37 +41,37 @@ define amdgpu_kernel void @v_clamp_f32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -105,15 +105,15 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, -v3, -v3 clamp
@@ -122,37 +122,37 @@ define amdgpu_kernel void @v_clamp_neg_f32(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, -v1, -v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -187,15 +187,15 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, -|v3|, -|v3| clamp
@@ -204,37 +204,37 @@ define amdgpu_kernel void @v_clamp_negabs_f32(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, -|v1|, -|v1| clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -273,15 +273,15 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_negzero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -292,43 +292,43 @@ define amdgpu_kernel void @v_clamp_negzero_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_negzero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negzero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -367,15 +367,15 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -386,43 +386,43 @@ define amdgpu_kernel void @v_clamp_negzero_maybe_snan_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x80000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 0x80000000, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negzero_maybe_snan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_maxmin_num_f32 v1, v1, 0x80000000, 1.0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -461,15 +461,15 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_multi_use_max_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -482,31 +482,31 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_multi_use_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0, v1
; GFX9-NEXT: v_min_f32_e32 v2, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_multi_use_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v1, 0, v1
; GFX11-NEXT: v_min_f32_e32 v2, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -515,16 +515,16 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
;
; GFX12-LABEL: v_clamp_multi_use_max_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1
; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1
-; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -563,15 +563,15 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, v3, v3 clamp
@@ -580,37 +580,37 @@ define amdgpu_kernel void @v_clamp_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -645,15 +645,15 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, -v3, -v3 clamp
@@ -662,37 +662,37 @@ define amdgpu_kernel void @v_clamp_neg_f16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, -v1, -v1 clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -728,15 +728,15 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e64 v2, -|v3|, -|v3| clamp
@@ -745,37 +745,37 @@ define amdgpu_kernel void @v_clamp_negabs_f16(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f16_e64 v1, -|v1|, -|v1| clamp
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -812,15 +812,15 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
@@ -829,37 +829,37 @@ define amdgpu_kernel void @v_clamp_f64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], v[0:1], v[0:1] clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -893,15 +893,15 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
;
; GFX8-LABEL: v_clamp_neg_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
@@ -910,37 +910,37 @@ define amdgpu_kernel void @v_clamp_neg_f64(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: v_clamp_neg_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -v[0:1], -v[0:1] clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -975,15 +975,15 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: v_clamp_negabs_f64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
@@ -992,37 +992,37 @@ define amdgpu_kernel void @v_clamp_negabs_f64(ptr addrspace(1) %out, ptr addrspa
;
; GFX9-LABEL: v_clamp_negabs_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f64_e64 v[0:1], -|v[0:1]|, -|v[0:1]| clamp
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1060,16 +1060,16 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
;
; GFX8-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_brev_b32 s0, 1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: s_brev_b32 s0, 1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, s0, 1.0, v3
@@ -1078,38 +1078,38 @@ define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_brev_b32 s0, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT: s_brev_b32 s2, 1
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_med3_f32 v1, s2, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: v_med3_f32 v1, s0, 1.0, v1
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0x80000000, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_negzero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_med3_num_f32 v1, 0x80000000, 1.0, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1141,15 +1141,15 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_aby_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1158,37 +1158,37 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_aby_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1220,15 +1220,15 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_bay_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1237,37 +1237,37 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_bay_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bay_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bay_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1299,15 +1299,15 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_yab_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1316,37 +1316,37 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_yab_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yab_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yab_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1378,15 +1378,15 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_yba_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1395,37 +1395,37 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_yba_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yba_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yba_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1457,15 +1457,15 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_ayb_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1474,37 +1474,37 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_ayb_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_ayb_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_ayb_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1536,15 +1536,15 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_med3_bya_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -1553,37 +1553,37 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_med3_bya_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bya_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bya_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1611,41 +1611,41 @@ define amdgpu_kernel void @v_clamp_constants_to_one_f32(ptr addrspace(1) %out) #
;
; GFX8-LABEL: v_clamp_constants_to_one_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 1.0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constants_to_one_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 1.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constants_to_one_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constants_to_one_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1670,41 +1670,41 @@ define amdgpu_kernel void @v_clamp_constants_to_zero_f32(ptr addrspace(1) %out)
;
; GFX8-LABEL: v_clamp_constants_to_zero_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constants_to_zero_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constants_to_zero_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constants_to_zero_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1730,41 +1730,41 @@ define amdgpu_kernel void @v_clamp_constant_preserve_f32(ptr addrspace(1) %out)
;
; GFX8-LABEL: v_clamp_constant_preserve_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0.5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_preserve_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0.5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_preserve_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_preserve_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1790,41 +1790,41 @@ define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_preserve_denorm_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1849,41 +1849,41 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32(ptr addrspace(1) %out) #0 {
;
; GFX8-LABEL: v_clamp_constant_qnan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_qnan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_qnan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_qnan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1908,41 +1908,41 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32(ptr addrspace(1) %out) #0 {
;
; GFX8-LABEL: v_clamp_constant_snan_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_snan_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_snan_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_snan_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1977,15 +1977,15 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -1995,40 +1995,40 @@ define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2063,15 +2063,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e64 v2, v3, 0.5 clamp
@@ -2080,37 +2080,37 @@ define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_dx10clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 0.5 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2146,15 +2146,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
;
; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v3
@@ -2164,40 +2164,40 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2232,15 +2232,15 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
;
; GFX8-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -2250,40 +2250,40 @@ define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(ptr addrspace(
;
; GFX9-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_f32_snan_no_dx10clamp_nnan_src:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e64 v1, v1, 1.0 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2318,15 +2318,15 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -2335,37 +2335,37 @@ define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_aby_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2397,15 +2397,15 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f32_e64 v2, v3, v3 clamp
@@ -2414,37 +2414,37 @@ define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f32_e64 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bay_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2476,15 +2476,15 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, v3, 0, 1.0
@@ -2493,37 +2493,37 @@ define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 0, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yab_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2555,15 +2555,15 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, v3, 1.0, 0
@@ -2572,37 +2572,37 @@ define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 1.0, 0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 1.0, 0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_yba_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2634,15 +2634,15 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, 0, v3, 1.0
@@ -2651,37 +2651,37 @@ define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, 0, v1, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 0, v1, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_ayb_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2713,15 +2713,15 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX8-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_med3_f32 v2, 1.0, v3, 0
@@ -2730,37 +2730,37 @@ define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(ptr addrspace(1) %
;
; GFX9-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, 1.0, v1, 0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, 1.0, v1, 0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_med3_bya_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2788,41 +2788,41 @@ define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(ptr addrspace
;
; GFX8-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2848,41 +2848,41 @@ define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(ptr addrspace
;
; GFX8-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800001
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800001
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2918,15 +2918,15 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: v_clamp_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -2937,37 +2937,37 @@ define amdgpu_kernel void @v_clamp_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_clamp_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3011,16 +3011,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_v2f16_undef_elt:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3035,37 +3035,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_clamp_v2f16_undef_elt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_elt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_elt:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3107,15 +3107,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
;
; GFX8-LABEL: v_clamp_v2f16_not_zero:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
@@ -3128,45 +3128,45 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_clamp_v2f16_not_zero:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_zero:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 2.0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_not_zero:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 2.0
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel_hi:[1,0]
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3207,15 +3207,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_v2f16_not_one:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_e32 v2, v3, v3
@@ -3228,45 +3228,45 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_v2f16_not_one:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, 0
; GFX9-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_not_one:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, 0
; GFX11-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_not_one:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, 0
; GFX12-NEXT: v_pk_min_num_f16 v1, v1, 1.0 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3307,15 +3307,15 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: v_clamp_neg_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3326,37 +3326,37 @@ define amdgpu_kernel void @v_clamp_neg_v2f16(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_clamp_neg_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neg_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neg_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3398,15 +3398,15 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
;
; GFX8-LABEL: v_clamp_negabs_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -|v3|, -|v3| clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3417,42 +3417,42 @@ define amdgpu_kernel void @v_clamp_negabs_v2f16(ptr addrspace(1) %out, ptr addrs
;
; GFX9-LABEL: v_clamp_negabs_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_negabs_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_negabs_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3496,15 +3496,15 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_neglo_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3515,37 +3515,37 @@ define amdgpu_kernel void @v_clamp_neglo_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_neglo_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neglo_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neglo_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_lo:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3588,15 +3588,15 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: v_clamp_neghi_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, -v3, -v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3607,37 +3607,37 @@ define amdgpu_kernel void @v_clamp_neghi_v2f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_clamp_neghi_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_neghi_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_neghi_v2f16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 neg_hi:[1,1] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3680,15 +3680,15 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: v_clamp_v2f16_shuffle:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -3699,37 +3699,37 @@ define amdgpu_kernel void @v_clamp_v2f16_shuffle(ptr addrspace(1) %out, ptr addr
;
; GFX9-LABEL: v_clamp_v2f16_shuffle:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_shuffle:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_shuffle:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 op_sel:[1,1] op_sel_hi:[0,0] clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3774,16 +3774,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x3c00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3798,37 +3798,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts0:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3872,16 +3872,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
;
; GFX8-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f16_sdwa v2, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -3896,37 +3896,37 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
;
; GFX9-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_v2f16_undef_limit_elts1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_max_num_f16 v1, v1, v1 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3961,70 +3961,70 @@ define amdgpu_kernel void @v_clamp_diff_source_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_clamp_diff_source_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x8
-; GFX8-NEXT: s_add_u32 s0, s0, 12
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s6, s[6:7], 0x8
+; GFX8-NEXT: s_add_u32 s2, s4, 12
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_add_f32_e32 v0, s4, v0
-; GFX8-NEXT: v_add_f32_e32 v1, s4, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
+; GFX8-NEXT: v_add_f32_e32 v0, s0, v0
+; GFX8-NEXT: v_add_f32_e32 v1, s0, v1
; GFX8-NEXT: v_max_f32_e64 v2, v0, v1 clamp
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_clamp_diff_source_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX9-NEXT: s_load_dword s6, s[2:3], 0x8
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s2, s[6:7], 0x8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_add_f32_e32 v1, s4, v1
-; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_add_f32_e32 v1, s0, v1
+; GFX9-NEXT: v_add_f32_e32 v2, s0, v2
; GFX9-NEXT: v_max_f32_e64 v1, v1, v2 clamp
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:12
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5] offset:12
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_clamp_diff_source_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x8
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e64 v0, s4, s5
-; GFX11-NEXT: v_add_f32_e64 v1, s4, s2
+; GFX11-NEXT: v_add_f32_e64 v0, s0, s1
+; GFX11-NEXT: v_add_f32_e64 v1, s0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e64 v0, v0, v1 clamp
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1] offset:12
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5] offset:12
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_clamp_diff_source_f32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX12-NEXT: s_load_b96 s[0:2], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_add_f32 s2, s4, s5
-; GFX12-NEXT: s_add_f32 s3, s4, s6
+; GFX12-NEXT: s_add_f32 s1, s0, s1
+; GFX12-NEXT: s_add_f32 s0, s0, s2
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_3)
-; GFX12-NEXT: s_max_num_f32 s2, s2, s3
-; GFX12-NEXT: v_max_num_f32_e64 v1, s2, s2 clamp
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] offset:12
+; GFX12-NEXT: s_max_num_f32 s0, s1, s0
+; GFX12-NEXT: v_max_num_f32_e64 v1, s0, s0 clamp
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] offset:12
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index 9c7fa1537c0c2..b969573c8ad8f 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -20,14 +20,14 @@ define amdgpu_kernel void @add1(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: add1:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -103,14 +103,14 @@ define amdgpu_kernel void @sub1(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: sub1:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -450,15 +450,15 @@ define amdgpu_kernel void @add_and(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: add_and:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_max_u32_e32 v1, 1, v1
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -493,14 +493,14 @@ define amdgpu_kernel void @cmp_sub_sext(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: cmp_sub_sext:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -533,14 +533,14 @@ define amdgpu_kernel void @cmp_sub_zext(ptr addrspace(1) nocapture %arg) {
;
; GFX9-LABEL: cmp_sub_zext:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v0, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX9-NEXT: global_load_dword v3, v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subbrev_co_u32_e32 v0, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_endpgm
bb:
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
index c27e44609c527..4b266d0647fbd 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll
@@ -4,13 +4,13 @@
define amdgpu_kernel void @vectorLoadCombine(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadCombine:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -37,14 +37,14 @@ entry:
define amdgpu_kernel void @vectorLoadShuffle(ptr %in, ptr %out) {
; GCN-LABEL: vectorLoadShuffle:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s0, 0x7050604
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_load_dword v2, v[0:1]
-; GCN-NEXT: s_mov_b32 s0, 0x7050604
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN-NEXT: v_perm_b32 v2, v2, v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
index e9dbce9026ca0..52b9603e20932 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_copy_v4i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -73,24 +73,23 @@ define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspa
; VI-LABEL: test_copy_v4i8_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, s7
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -129,27 +128,27 @@ define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspa
;
; VI-LABEL: test_copy_v4i8_x3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -199,31 +198,30 @@ define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s22, s10
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s23, s11
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s20, s6
-; VI-NEXT: s_mov_b32 s21, s7
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s22, s2
+; VI-NEXT: s_mov_b32 s23, s3
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s20, s10
+; VI-NEXT: s_mov_b32 s21, s11
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0
@@ -280,22 +278,21 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; VI-LABEL: test_copy_v4i8_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, s7
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
@@ -310,7 +307,7 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
@@ -365,23 +362,23 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s11
+; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, 0xffffff00, v1
@@ -396,9 +393,9 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, p
; VI-NEXT: v_add_u16_e32 v2, 0x900, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -433,19 +430,19 @@ define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
-; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid.x
@@ -477,22 +474,22 @@ define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load <3 x i8>, ptr addrspace(1) %in, align 2
store <3 x i8> %val, ptr addrspace(1) %out, align 2
@@ -525,24 +522,24 @@ define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: test_copy_v3i8_align1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(1)
-; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: v_lshrrev_b16_e32 v0, 8, v0
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:1
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1
; VI-NEXT: s_endpgm
%val = load <3 x i8>, ptr addrspace(1) %in, align 1
store <3 x i8> %val, ptr addrspace(1) %out, align 1
@@ -569,19 +566,19 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, p
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load volatile <4 x i8>, ptr addrspace(1) %in, align 4
store <4 x i8> %val, ptr addrspace(1) %out, align 4
@@ -618,28 +615,28 @@ define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out,
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT: buffer_load_ubyte v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 offset:3
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
+; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v2, off, s[4:7], 0 offset:1
+; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 offset:1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_byte v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%val = load <4 x i8>, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
index 7dd95a02f136b..f10fe68eac789 100644
--- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll
@@ -8,21 +8,21 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) %
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-NEXT: s_nop 0
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:252
-; GCN-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc
-; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec
-; GCN-NEXT: s_cselect_b32 s2, 2, 3
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_cmp_lg_u32 s0, 0
+; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], vcc
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, 2, 3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: global_store_dword v1, v0, s[4:5]
; GCN-NEXT: s_endpgm
entry: ; preds = %1009
%0 = load i32, ptr addrspace(1) %in, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll
index 332b6013803cf..848ac3b50f1e2 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll
@@ -36,15 +36,15 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n
;
; VI-LABEL: s_ctlz_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s4, s4
-; VI-NEXT: s_min_u32 s4, s4, 32
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_flbit_i32_b32 s0, s2
+; VI-NEXT: s_min_u32 s0, s0, 32
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i32:
@@ -88,14 +88,14 @@ define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) n
; GFX11-LABEL: s_ctlz_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s2
+; GFX11-NEXT: s_clz_i32_u32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 32
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_min_u32 s0, s0, 32
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -612,16 +612,16 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctlz_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s4, s[4:5]
-; VI-NEXT: s_min_u32 s4, s4, 64
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctlz_i64:
@@ -674,13 +674,13 @@ define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32],
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u64 s2, s[2:3]
+; GFX11-NEXT: s_clz_i32_u64 s0, s[2:3]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 64
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX11-NEXT: s_min_u32 s0, s0, 64
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 97529b5687a71..2dd3a7bd6b5bd 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -41,13 +41,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_ctlz_zero_undef_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_flbit_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -331,14 +331,14 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
;
; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s2, 24
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 24
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -405,15 +405,15 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_add_i32 s2, s2, -16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: s_add_i32 s0, s0, -16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -479,13 +479,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_flbit_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1198,13 +1198,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
; VI-LABEL: s_ctlz_zero_undef_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_flbit_i32_b64 s2, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_flbit_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2218,19 +2218,19 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_ctlz_zero_undef_i18:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x3ffff
-; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_i32 s2, s2, -14
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: s_add_u32 s0, s0, 2
+; VI-NEXT: s_and_b32 s0, s4, 0x3ffff
+; VI-NEXT: s_flbit_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_add_i32 s4, s0, -14
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: s_add_u32 s0, s2, 2
; VI-NEXT: flat_store_short v[0:1], v2
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: s_bfe_u32 s2, s2, 0x20010
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_bfe_u32 s2, s4, 0x20010
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
index 4f2bde8d0842b..6e39b83e155de 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val)
;
; VI-LABEL: s_ctpop_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s4, s4, 0xffff
-; VI-NEXT: s_bcnt1_i32_b32 s4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: s_bcnt1_i32_b32 s0, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ctpop_i16:
@@ -167,14 +167,14 @@ define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out,
; VI-LABEL: v_ctpop_add_chain_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1423,15 +1423,15 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %ou
; VI-LABEL: v_ctpop_i16_add_vvar_inv:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_mov_b32 s7, 0xf000
@@ -1521,29 +1521,29 @@ define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: ctpop_i16_in_br:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_cmp_lg_u32 s5, 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc0 .LBB14_4
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: s_cbranch_execnz .LBB14_3
; VI-NEXT: .LBB14_2: ; %if
-; VI-NEXT: s_and_b32 s2, s4, 0xffff
-; VI-NEXT: s_bcnt1_i32_b32 s2, s2
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: s_bcnt1_i32_b32 s0, s0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: .LBB14_3: ; %endif
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB14_4:
; VI-NEXT: ; implicit-def: $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index 633f12047e5b1..bd451dc5cd8fb 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -28,14 +28,14 @@ define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_ctpop_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
@@ -116,7 +116,7 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
; VI-LABEL: v_ctpop_i64_user:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
@@ -128,8 +128,8 @@ define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr a
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bcnt_u32_b32 v0, v0, 0
; VI-NEXT: v_bcnt_u32_b32 v0, v1, v0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_or_b32_e32 v0, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_or_b32_e32 v0, s2, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -159,15 +159,15 @@ define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64
; VI-LABEL: s_ctpop_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_endpgm
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -197,19 +197,19 @@ define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64
; VI-LABEL: s_ctpop_v4i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s15, 0xf000
+; VI-NEXT: s_mov_b32 s14, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_bcnt1_i32_b64 s5, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
-; VI-NEXT: s_bcnt1_i32_b64 s7, s[10:11]
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[6:7]
+; VI-NEXT: s_bcnt1_i32_b64 s2, s[8:9]
+; VI-NEXT: s_bcnt1_i32_b64 s3, s[10:11]
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-NEXT: s_endpgm
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -424,15 +424,15 @@ define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val
; VI-LABEL: s_ctpop_i128:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bcnt1_i32_b64 s6, s[6:7]
-; VI-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_add_i32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bcnt1_i32_b64 s0, s[6:7]
+; VI-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
+; VI-NEXT: s_add_i32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
diff --git a/llvm/test/CodeGen/AMDGPU/cttz.ll b/llvm/test/CodeGen/AMDGPU/cttz.ll
index 483402d4778d1..e1b01c0389ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz.ll
@@ -35,15 +35,15 @@ define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) n
;
; VI-LABEL: s_cttz_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s4, s4
-; VI-NEXT: s_min_u32 s4, s4, 32
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_ff1_i32_b32 s0, s2
+; VI-NEXT: s_min_u32 s0, s0, 32
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_cttz_i32:
@@ -519,16 +519,16 @@ define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32],
;
; VI-LABEL: s_cttz_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b64 s4, s[4:5]
-; VI-NEXT: s_min_u32 s4, s4, 64
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ff1_i32_b64 s0, s[2:3]
+; VI-NEXT: s_min_u32 s0, s0, 64
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_cttz_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index a6cbfa52db532..7eb2e52acef07 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -28,13 +28,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out,
;
; VI-LABEL: s_cttz_zero_undef_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -317,13 +317,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
;
; VI-LABEL: s_cttz_zero_undef_i8_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -386,13 +386,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_cttz_zero_undef_i16_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -455,13 +455,13 @@ define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
;
; VI-LABEL: s_cttz_zero_undef_i32_with_select:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ff1_i32_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_ff1_i32_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index fd4e182f6804e..e6d68a1bb15e1 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -2788,36 +2788,36 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: cvt_ubyte0_or_multiuse:
; VI: ; %bb.0: ; %bb
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; VI-NEXT: v_add_f32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: cvt_ubyte0_or_multiuse:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX10-NEXT: global_load_dword v0, v0, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX10-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX9-LABEL: cvt_ubyte0_or_multiuse:
@@ -2836,17 +2836,17 @@ define amdgpu_kernel void @cvt_ubyte0_or_multiuse(ptr addrspace(1) %in, ptr addr
;
; GFX11-LABEL: cvt_ubyte0_or_multiuse:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[4:5]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_or_b32_e32 v0, 0x80000001, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, v0
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v2, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
index fed4b9862dbfb..37b4dfac0bb9d 100644
--- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll
@@ -8,13 +8,13 @@
define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: add:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_add v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -30,13 +30,13 @@ define protected amdgpu_kernel void @add(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: sub:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -52,13 +52,13 @@ define protected amdgpu_kernel void @sub(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: and:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_and v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -74,13 +74,13 @@ define protected amdgpu_kernel void @and(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: or:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_or v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -96,13 +96,13 @@ define protected amdgpu_kernel void @or(ptr addrspace(1) %p, ptr addrspace(1) %q
define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: xor:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -118,28 +118,28 @@ define protected amdgpu_kernel void @xor(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: nand:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB5_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_not_b32_e32 v0, v3
; CHECK-NEXT: v_or_b32_e32 v2, -2, v0
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB5_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -154,13 +154,13 @@ define protected amdgpu_kernel void @nand(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: max_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -176,13 +176,13 @@ define protected amdgpu_kernel void @max_workgroup(ptr addrspace(1) %p, ptr addr
define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: max:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -198,13 +198,13 @@ define protected amdgpu_kernel void @max(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: min_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -220,13 +220,13 @@ define protected amdgpu_kernel void @min_workgroup(ptr addrspace(1) %p, ptr addr
define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: min:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -242,13 +242,13 @@ define protected amdgpu_kernel void @min(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umax_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -264,13 +264,13 @@ define protected amdgpu_kernel void @umax_workgroup(ptr addrspace(1) %p, ptr add
define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umax:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -286,13 +286,13 @@ define protected amdgpu_kernel void @umax(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umin_workgroup:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -308,13 +308,13 @@ define protected amdgpu_kernel void @umin_workgroup(ptr addrspace(1) %p, ptr add
define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: umin:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -330,14 +330,14 @@ define protected amdgpu_kernel void @umin(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: cmpxchg:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 2
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -354,13 +354,13 @@ define protected amdgpu_kernel void @cmpxchg(ptr addrspace(1) %p, ptr addrspace(
define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: xchg:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -376,13 +376,13 @@ define protected amdgpu_kernel void @xchg(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: inc:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_inc v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -398,13 +398,13 @@ define protected amdgpu_kernel void @inc(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: dec:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: global_atomic_dec v2, v0, v1, s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
@@ -420,28 +420,28 @@ define protected amdgpu_kernel void @dec(ptr addrspace(1) %p, ptr addrspace(1) %
define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fadd:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB18_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_add_f32_e32 v2, 1.0, v3
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB18_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -457,28 +457,28 @@ define protected amdgpu_kernel void @fadd(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fsub:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; CHECK-NEXT: s_mov_b64 s[0:1], 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: .LBB19_1: ; %atomicrmw.start
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_mov_b32_e32 v3, v0
; CHECK-NEXT: v_add_f32_e32 v2, -1.0, v3
-; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT: global_atomic_cmpswap v0, v1, v[2:3], s[4:5] glc
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; CHECK-NEXT: s_andn2_b64 exec, exec, s[0:1]
; CHECK-NEXT: s_cbranch_execnz .LBB19_1
; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
-; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT: s_or_b64 exec, exec, s[0:1]
; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: v_mov_b32_e32 v0, s6
+; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: global_store_dword v[0:1], v2, off
@@ -494,14 +494,14 @@ define protected amdgpu_kernel void @fsub(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fmin:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -519,14 +519,14 @@ define protected amdgpu_kernel void @fmin(ptr addrspace(1) %p, ptr addrspace(1)
define protected amdgpu_kernel void @fmax(ptr addrspace(1) %p, ptr addrspace(1) %q) {
; CHECK-LABEL: fmax:
; CHECK: ; %bb.0:
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
-; CHECK-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-NEXT: v_mov_b32_e32 v3, s3
+; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[4:5] glc
+; CHECK-NEXT: v_mov_b32_e32 v2, s6
+; CHECK-NEXT: v_mov_b32_e32 v3, s7
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
@@ -547,13 +547,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.swap(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.swap.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -569,13 +569,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.add(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -591,13 +591,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.sub(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.sub.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -613,13 +613,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -635,13 +635,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.smax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.smax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -657,13 +657,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umin.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -679,13 +679,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.umax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.umax.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -701,13 +701,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.and(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.and.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -723,13 +723,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.or(ptr addrspace(8) %rsrc
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.or.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -745,13 +745,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.xor(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.xor.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -767,13 +767,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.inc(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.inc.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -789,13 +789,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.dec(ptr addrspace(8) %rsr
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s2
; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.dec.i32(i32 1, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -812,13 +812,13 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.cmpswap(ptr addrspace(8)
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 2
; CHECK-NEXT: v_mov_b32_e32 v0, 1
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%n32 = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.cmpswap.i32(i32 1, i32 2, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -834,14 +834,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fadd(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v1, 1.0
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: buffer_atomic_add_f32 v1, v0, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f32 = call float @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f32(float 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -859,14 +860,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmin(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f64 = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -884,14 +886,15 @@ define protected amdgpu_kernel void @buffer.ptr.atomic.fmax(ptr addrspace(8) %rs
; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
-; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
+; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
; CHECK-NEXT: global_store_dword v[0:1], v2, off
; CHECK-NEXT: s_endpgm
%f64 = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double 1.0, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 67b0cef92d4d3..cff77bfae8aab 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -42,13 +42,13 @@ define amdgpu_kernel void @uniform_vec_0_i16(ptr addrspace(1) %out, i16 %a) {
; GFX11-LABEL: uniform_vec_0_i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -127,13 +127,13 @@ define amdgpu_kernel void @uniform_vec_i16_0(ptr addrspace(1) %out, i16 %a) {
; GFX11-LABEL: uniform_vec_i16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -212,13 +212,13 @@ define amdgpu_kernel void @uniform_vec_f16_0(ptr addrspace(1) %out, half %a) {
; GFX11-LABEL: uniform_vec_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT: s_and_b32 s0, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -277,12 +277,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_i16_LL:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
@@ -290,12 +290,12 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX906-LABEL: uniform_vec_i16_LL:
; GFX906: ; %bb.0:
-; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
@@ -303,10 +303,10 @@ define amdgpu_kernel void @uniform_vec_i16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX11-LABEL: uniform_vec_i16_LL:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -561,12 +561,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX9-LABEL: uniform_vec_f16_LL:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX9-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
@@ -574,12 +574,12 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX906-LABEL: uniform_vec_f16_LL:
; GFX906: ; %bb.0:
-; GFX906-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_load_dword s4, s[0:1], 0x0
-; GFX906-NEXT: s_load_dword s5, s[2:3], 0x0
+; GFX906-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX906-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: s_pack_ll_b32_b16 s0, s4, s5
+; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s0
; GFX906-NEXT: ;;#ASMEND
@@ -587,10 +587,10 @@ define amdgpu_kernel void @uniform_vec_f16_LL(ptr addrspace(4) %in0, ptr addrspa
;
; GFX11-LABEL: uniform_vec_f16_LL:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
; GFX11-NEXT: ;;#ASMSTART
@@ -723,13 +723,13 @@ define amdgpu_kernel void @build_vec_v2i16_undeflo_uniform(ptr addrspace(3) %in,
; GFX11-LABEL: build_vec_v2i16_undeflo_uniform:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: ds_load_u16_d16 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index b0e1da3b8eecb..b5933b4e03dc2 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -7,11 +7,11 @@
define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds1align1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_u8 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b8 v1, v0
; GCN-NEXT: s_endpgm
@@ -23,12 +23,12 @@ define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds2align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:1
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b8 v2, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -37,12 +37,12 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds2align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 8, v1
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v1, 8, v0
@@ -52,11 +52,11 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds2align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_u16 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b16 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -68,11 +68,11 @@ define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds2align2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_u16 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b16 v1, v0
; GCN-NEXT: s_endpgm
@@ -84,14 +84,14 @@ define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds4align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:3
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v4, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -104,15 +104,15 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds4align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:3
; ALIGNED-GISEL-NEXT: ds_read_u8 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
@@ -130,11 +130,11 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds4align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b32 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b32 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -146,12 +146,12 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds4align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:2
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b16 v2, v1
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -160,12 +160,12 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds4align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; ALIGNED-GISEL-NEXT: ds_write_b16 v2, v0
@@ -174,11 +174,11 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds4align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b32 v0, v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v1, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b32 v1, v0
; UNALIGNED-NEXT: s_endpgm
@@ -190,11 +190,11 @@ define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds4align4:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b32 v1, v0
; GCN-NEXT: s_endpgm
@@ -206,9 +206,9 @@ define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds8align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -217,7 +217,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-SDAG-NEXT: ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT: ds_read_u8 v8, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:7
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v7, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v7, v5 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -234,9 +234,9 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds8align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -258,7 +258,7 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v7
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v3, v2
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v2, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v3, v2 offset:1
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, 8
@@ -275,11 +275,11 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds8align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
@@ -291,14 +291,14 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds8align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:6
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b16 v4, v1 offset:4
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -311,14 +311,14 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; ALIGNED-GISEL-LABEL: ds8align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:6
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -331,11 +331,11 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
;
; UNALIGNED-LABEL: ds8align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v2, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b64 v2, v[0:1]
; UNALIGNED-NEXT: s_endpgm
@@ -347,11 +347,11 @@ define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds8align4:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
-; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b32 v2, v0, v1 offset1:1
; GCN-NEXT: s_endpgm
@@ -363,11 +363,11 @@ define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds8align8:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b64 v[0:1], v0
-; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b64 v2, v[0:1]
; GCN-NEXT: s_endpgm
@@ -379,9 +379,9 @@ define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out
define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -394,7 +394,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT: ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:11
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v12, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v12, v9 offset:8
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -415,9 +415,9 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -449,7 +449,7 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v4, v3
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v3, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: v_or3_b32 v2, v6, v7, v2
; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v4, v3 offset:1
@@ -473,11 +473,11 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds12align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT: s_endpgm
@@ -489,15 +489,15 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:8
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT: ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v6, s3
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:10
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT: ds_write_b16 v6, v1 offset:8
@@ -513,16 +513,16 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT: ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT: ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT: ds_read_u16 v0, v0 offset:10
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v6, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
@@ -539,11 +539,11 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds12align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT: s_endpgm
@@ -555,12 +555,12 @@ define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-LABEL: ds12align4:
; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
@@ -569,12 +569,12 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds12align4:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -583,11 +583,11 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds12align4:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -599,12 +599,12 @@ define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds12align8:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v2
; ALIGNED-SDAG-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT: ds_write_b64 v3, v[0:1]
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -613,12 +613,12 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds12align8:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-GISEL-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GISEL-NEXT: ds_read_b32 v2, v2 offset:8
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT: ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(1)
@@ -627,12 +627,12 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds12align8:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-SDAG-NEXT: ds_read_b32 v2, v0 offset:8
; UNALIGNED-SDAG-NEXT: ds_read_b64 v[0:1], v0
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write_b32 v3, v2 offset:8
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -641,11 +641,11 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds12align8:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read_b96 v[0:2], v0
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v3, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write_b96 v3, v[0:2]
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -657,11 +657,11 @@ define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds12align16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b96 v[0:2], v0
-; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b96 v3, v[0:2]
; GCN-NEXT: s_endpgm
@@ -673,9 +673,9 @@ define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %o
define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds16align1:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT: ds_read_u8 v3, v0 offset:2
@@ -692,7 +692,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u8 v14, v0 offset:13
; ALIGNED-SDAG-NEXT: ds_read_u8 v15, v0 offset:14
; ALIGNED-SDAG-NEXT: ds_read_u8 v0, v0 offset:15
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v16, s3
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT: ds_write_b8 v16, v13 offset:12
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(3)
@@ -716,9 +716,9 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds16align1:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT: ds_read_u8 v3, v0 offset:2
@@ -760,7 +760,7 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshlrev_b32_e32 v5, 16, v9
; ALIGNED-GISEL-NEXT: v_or3_b32 v0, v0, v5, v4
; ALIGNED-GISEL-NEXT: v_lshrrev_b16_e32 v4, 8, v1
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v5, s3
; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v1
; ALIGNED-GISEL-NEXT: ds_write_b8 v5, v4 offset:1
; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, 8
@@ -789,11 +789,11 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds16align1:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
@@ -805,9 +805,9 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-SDAG-LABEL: ds16align2:
; ALIGNED-SDAG: ; %bb.0:
-; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-SDAG-NEXT: ds_read_u16 v1, v0 offset:12
; ALIGNED-SDAG-NEXT: ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT: ds_read_u16 v3, v0 offset:2
@@ -815,7 +815,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-SDAG-NEXT: ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT: ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT: ds_read_u16 v7, v0 offset:10
-; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s1
+; ALIGNED-SDAG-NEXT: v_mov_b32_e32 v8, s3
; ALIGNED-SDAG-NEXT: ds_read_u16 v0, v0 offset:14
; ALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT: ds_write_b16 v8, v1 offset:12
@@ -835,9 +835,9 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; ALIGNED-GISEL-LABEL: ds16align2:
; ALIGNED-GISEL: ; %bb.0:
-; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; ALIGNED-GISEL-NEXT: ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT: ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT: ds_read_u16 v3, v0 offset:4
@@ -850,7 +850,7 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v2, v4, 16, v3
-; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT: v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
@@ -867,11 +867,11 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-LABEL: ds16align2:
; UNALIGNED: ; %bb.0:
-; UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-NEXT: ds_read_b128 v[0:3], v0
-; UNALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT: ds_write_b128 v4, v[0:3]
; UNALIGNED-NEXT: s_endpgm
@@ -883,12 +883,12 @@ define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; ALIGNED-LABEL: ds16align4:
; ALIGNED: ; %bb.0:
-; ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; ALIGNED-NEXT: v_mov_b32_e32 v2, s0
+; ALIGNED-NEXT: v_mov_b32_e32 v2, s2
; ALIGNED-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
-; ALIGNED-NEXT: v_mov_b32_e32 v4, s1
+; ALIGNED-NEXT: v_mov_b32_e32 v4, s3
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT: ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-NEXT: s_waitcnt lgkmcnt(1)
@@ -897,12 +897,12 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-SDAG-LABEL: ds16align4:
; UNALIGNED-SDAG: ; %bb.0:
-; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v2, s2
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
; UNALIGNED-SDAG-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
-; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-SDAG-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
; UNALIGNED-SDAG-NEXT: ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
; UNALIGNED-SDAG-NEXT: s_waitcnt lgkmcnt(1)
@@ -911,11 +911,11 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
;
; UNALIGNED-GISEL-LABEL: ds16align4:
; UNALIGNED-GISEL: ; %bb.0:
-; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; UNALIGNED-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v0, s2
; UNALIGNED-GISEL-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; UNALIGNED-GISEL-NEXT: v_mov_b32_e32 v4, s3
; UNALIGNED-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; UNALIGNED-GISEL-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-GISEL-NEXT: s_endpgm
@@ -927,11 +927,11 @@ define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds16align8:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
-; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GCN-NEXT: s_endpgm
@@ -943,11 +943,11 @@ define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %ou
define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GCN-LABEL: ds16align16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b128 v[0:3], v0
-; GCN-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NEXT: v_mov_b32_e32 v4, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b128 v4, v[0:3]
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
index 5814b8a8ceda4..4cd583590450b 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -36,7 +36,7 @@ define amdgpu_kernel void @ds_combine_nodep(ptr addrspace(1) %out, ptr addrspace
; GCN-LABEL: {{^}}ds_combine_WAR
; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:7 offset1:27
-; GCN-NEXT: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
+; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
define amdgpu_kernel void @ds_combine_WAR(ptr addrspace(1) %out, ptr addrspace(3) %inptr) {
%addr0 = getelementptr i8, ptr addrspace(3) %inptr, i32 100
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 06908d21e5355..ee374bd2bae34 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -105,12 +105,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
@@ -151,12 +151,12 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1)
;
; GFX9-LABEL: simple_write2_two_val_f32_volatile_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: ds_write_b32 v0, v2 offset:32
@@ -368,11 +368,11 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C
;
; GFX9-LABEL: simple_write2_two_val_too_far_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: ds_write_b32 v0, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -413,11 +413,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr
;
; GFX9-LABEL: simple_write2_two_val_f32_x2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -469,11 +469,11 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa
;
; GFX9-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x8
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x8
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:11 offset1:27
@@ -963,11 +963,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-ALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-ALIGNED: ; %bb.0:
-; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX9-ALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX9-ALIGNED-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-ALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-ALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, s1
@@ -979,11 +979,11 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
;
; GFX9-UNALIGNED-LABEL: simple_write2_v4f32_superreg_align4:
; GFX9-UNALIGNED: ; %bb.0:
-; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x8
-; GFX9-UNALIGNED-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
+; GFX9-UNALIGNED-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s4
-; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0
+; GFX9-UNALIGNED-NEXT: v_lshl_add_u32 v0, v0, 4, s6
+; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s3
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index db3ea4df52981..e16bb28d4b2bc 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -411,15 +411,15 @@ entry:
; GCN-LABEL: {{^}}bit4_extelt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 3
-; GCN-NEXT: s_lshr_b32 s2, 0x1000100, s2
-; GCN-NEXT: s_and_b32 s2, s2, 1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: s_lshl_b32 s0, s4, 3
+; GCN-NEXT: s_lshr_b32 s0, 0x1000100, s0
+; GCN-NEXT: s_and_b32 s0, s0, 1
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 44d65c9e50086..6823dcf5dc2df 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -21,32 +21,32 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: extract_vector_elt_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:20
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:20
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_mov_b32_e32 v1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[0:1]
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:20
+; GFX11-NEXT: global_store_d16_hi_b16 v0, v1, s[4:5]
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:20
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -140,6 +140,7 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -147,35 +148,33 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_load_dword s1, s[2:3], 0x0
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s1
+; VI-NEXT: v_lshrrev_b32_e64 v2, v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,18 +316,18 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: v_extractelement_v4f16_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_short v[0:1], v2
@@ -336,13 +335,13 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
;
; GFX11-LABEL: v_extractelement_v4f16_2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] offset:4
+; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] offset:4
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -380,43 +379,42 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %
;
; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: buffer_load_dword v3, off, s[0:3], 0 glc
-; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
+; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_lshlrev_b32_e32 v0, 4, v3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b64 v[0:1], v0, v[1:2]
-; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc
; VI-NEXT: flat_store_short v[1:2], v0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
-; GFX11-NEXT: buffer_load_b32 v3, off, s[4:7], 0 glc dlc
+; GFX11-NEXT: buffer_load_b32 v3, off, s[0:3], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 4, v3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b64 v[1:2], v3, v[1:2]
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -451,12 +449,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
;
; VI-LABEL: reduce_load_vector_v8f16_extract_01:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -468,12 +466,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
;
; GFX11-LABEL: reduce_load_vector_v8f16_extract_01:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -512,12 +510,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
;
; VI-LABEL: reduce_load_vector_v8f16_extract_23:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_load_dword s0, s[2:3], 0x4
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x4
-; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -529,12 +527,12 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
;
; GFX11-LABEL: reduce_load_vector_v8f16_extract_23:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x4
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x4
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index 8f0d6393641a2..b24345079dadb 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -149,19 +149,19 @@ define amdgpu_kernel void @fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
;
; VI-LABEL: fabsf_v4f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: s_bitset0_b32 s1, 31
-; VI-NEXT: s_bitset0_b32 s0, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_and_b32 s0, s7, 0x7fffffff
+; VI-NEXT: s_and_b32 s1, s6, 0x7fffffff
+; VI-NEXT: s_bitset0_b32 s5, 31
+; VI-NEXT: s_bitset0_b32 s4, 31
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
index cdc6b5a48d0a6..00d77ded017c9 100644
--- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -36,50 +36,50 @@ define amdgpu_kernel void @fadd_f16(
; VI-LABEL: fadd_f16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s4
-; VI-NEXT: s_mov_b32 s9, s5
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: s_mov_b32 s3, s11
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 glc
+; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-SDAG-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s11
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s10
+; GFX11-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s4
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
-; GFX11-SDAG-NEXT: s_mov_b32 s6, s10
-; GFX11-SDAG-NEXT: s_mov_b32 s7, s11
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-SDAG-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -88,17 +88,17 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-GISEL-NEXT: s_mov_b32 s10, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX11-GISEL-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-GISEL-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, v1.l
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -112,24 +112,24 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
; GFX11-FAKE16-SDAG-NEXT: s_clause 0x1
; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, -1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s11
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s10
+; GFX11-FAKE16-SDAG-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s11, s3
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s10, s2
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s8, s4
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s9, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s10
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s11
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
@@ -138,17 +138,17 @@ define amdgpu_kernel void @fadd_f16(
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s10, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[10:11], s[2:3]
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[10:11]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
@@ -216,94 +216,94 @@ define amdgpu_kernel void @fadd_f16_imm_a(
;
; VI-LABEL: fadd_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x3c00
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x3c00
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_a:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 1.0, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -360,94 +360,94 @@ define amdgpu_kernel void @fadd_f16_imm_b(
;
; VI-LABEL: fadd_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, s2
-; VI-NEXT: s_mov_b32 s1, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, s6
+; VI-NEXT: s_mov_b32 s5, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.h, 0x4000
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_mov_b16_e32 v0.h, 0x4000
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s2
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s3
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, s6
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, s7
-; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s6
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s7
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, s2
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, s3
+; GFX11-FAKE16-SDAG-NEXT: buffer_load_u16 v0, off, s[4:7], 0
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_add_f16_e32 v0, 2.0, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_f16_imm_b:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[4:7], 0
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[0:1], s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
+; GFX11-FAKE16-GISEL-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_add_f16_e32 v0, 2.0, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -566,12 +566,12 @@ define amdgpu_kernel void @fadd_v2f16(
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -606,12 +606,12 @@ define amdgpu_kernel void @fadd_v2f16(
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FAKE16-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: s_clause 0x1
; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -684,85 +684,85 @@ define amdgpu_kernel void @fadd_v2f16_imm_a(
;
; VI-LABEL: fadd_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x4000
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_v2f16_imm_a:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_v2f16_imm_a:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_a:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_a:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x40003c00, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
@@ -823,85 +823,85 @@ define amdgpu_kernel void @fadd_v2f16_imm_b(
;
; VI-LABEL: fadd_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fadd_v2f16_imm_b:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fadd_v2f16_imm_b:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
;
; GFX11-FAKE16-SDAG-LABEL: fadd_v2f16_imm_b:
; GFX11-FAKE16-SDAG: ; %bb.0: ; %entry
-; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-SDAG-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX11-FAKE16-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-SDAG-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-SDAG-NEXT: s_nop 0
; GFX11-FAKE16-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-SDAG-NEXT: s_endpgm
;
; GFX11-FAKE16-GISEL-LABEL: fadd_v2f16_imm_b:
; GFX11-FAKE16-GISEL: ; %bb.0: ; %entry
-; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-FAKE16-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-FAKE16-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-GISEL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-FAKE16-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-FAKE16-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-GISEL-NEXT: v_pk_add_f16 v0, 0x3c004000, v0
-; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-FAKE16-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-FAKE16-GISEL-NEXT: s_nop 0
; GFX11-FAKE16-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 581b7b4cff9ed..fb47dae5f6739 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -21,20 +21,20 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_undef_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v0, s[0:1]
+; GFX9-NEXT: global_store_short v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_undef_value_f16:
@@ -49,10 +49,10 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_undef_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -64,10 +64,10 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(ptr addrspace(
define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
@@ -76,10 +76,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
;
; GFX9-LABEL: v_test_canonicalize_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v0, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: global_store_short v[0:1], v0, off
@@ -100,10 +100,10 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
;
; GFX11-LABEL: v_test_canonicalize_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
@@ -119,12 +119,12 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
; VI-LABEL: s_test_canonicalize_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -153,12 +153,12 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; GFX11-LABEL: s_test_canonicalize_var_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_max_f16_e64 v1, s2, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_max_f16_e64 v1, s4, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -239,10 +239,10 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, |v2|, |v2|
@@ -251,13 +251,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, |v1|, |v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fabs_var_f16:
@@ -275,13 +275,13 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
;
; GFX11-LABEL: v_test_canonicalize_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -295,10 +295,10 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, -|v2|, -|v2|
@@ -307,13 +307,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
@@ -331,13 +331,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
;
; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -352,10 +352,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e64 v2, -v2, -v2
@@ -364,13 +364,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_var_f16:
@@ -388,13 +388,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
;
; GFX11-LABEL: v_test_canonicalize_fneg_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -408,10 +408,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_e32 v2, -1.0, v2
@@ -420,13 +420,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
@@ -444,13 +444,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
;
; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,10 +464,10 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(ptr addrspace(1) %out) #2 {
; VI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ushort v2, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_e64 v2, -1.0, |v2|
@@ -476,13 +476,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
;
; GFX9-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1]
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
@@ -500,13 +500,13 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
;
; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -521,20 +521,20 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v0, s[0:1]
+; GFX9-NEXT: global_store_short v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p0_f16:
@@ -549,10 +549,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_p0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -564,21 +564,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff8000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n0_f16:
@@ -593,10 +593,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_n0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -608,21 +608,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p1_f16:
@@ -637,10 +637,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_p1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -652,21 +652,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffffbc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffffbc00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n1_f16:
@@ -681,10 +681,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: test_fold_canonicalize_n1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbc00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -696,21 +696,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f16(ptr addrspace(1) %out)
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_literal_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_literal_f16:
@@ -725,10 +725,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
;
; GFX11-LABEL: test_fold_canonicalize_literal_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -740,21 +740,21 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f16(ptr addrspace(1) %
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
@@ -769,10 +769,10 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -784,21 +784,21 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f1
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
@@ -813,10 +813,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -828,21 +828,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(ptr ad
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
@@ -857,10 +857,10 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
;
; GFX11-LABEL: test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -872,21 +872,21 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f1
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xffff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
@@ -901,10 +901,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -916,21 +916,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(ptr ad
define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_f16:
@@ -945,10 +945,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_qnan_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -960,21 +960,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
@@ -989,10 +989,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1004,21 +1004,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(ptr addrsp
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
@@ -1033,10 +1033,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1048,21 +1048,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(ptr addrsp
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan0_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan0_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan0_value_f16:
@@ -1077,10 +1077,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan0_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1092,21 +1092,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan1_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan1_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan1_value_f16:
@@ -1121,10 +1121,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan1_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1136,21 +1136,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan2_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan2_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan2_value_f16:
@@ -1165,10 +1165,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan2_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1180,21 +1180,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(ptr addrspace(
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan3_value_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan3_value_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan3_value_f16:
@@ -1209,10 +1209,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
;
; GFX11-LABEL: test_fold_canonicalize_snan3_value_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1224,32 +1224,32 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(ptr addrspace(
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_var_v2f16:
@@ -1277,13 +1277,13 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
;
; GFX11-LABEL: v_test_canonicalize_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1298,33 +1298,33 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fabs_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, |v0|, |v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, |v0|, |v0|
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fabs_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fabs_var_v2f16:
@@ -1352,15 +1352,15 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
;
; GFX11-LABEL: v_test_canonicalize_fabs_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1376,33 +1376,33 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, -|v0|, -|v0| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, -|v0|, -|v0|
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
@@ -1431,15 +1431,15 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
;
; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1456,32 +1456,32 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: v_test_canonicalize_fneg_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_sdwa v1, -v0, -v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_max_f16_e64 v0, -v0, -v0
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_canonicalize_fneg_var_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: v_test_canonicalize_fneg_var_v2f16:
@@ -1510,13 +1510,13 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
;
; GFX11-LABEL: v_test_canonicalize_fneg_var_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_max_f16 v0, v0, v0 neg_lo:[1,1] neg_hi:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1532,16 +1532,16 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 {
; VI-LABEL: s_test_canonicalize_var_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_max_f16_e64 v0, s2, s2
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_max_f16_e64 v0, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1575,12 +1575,12 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
; GFX11-LABEL: s_test_canonicalize_var_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_max_f16 v1, s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1593,20 +1593,20 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p0_v2f16:
@@ -1621,10 +1621,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_p0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1636,21 +1636,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x80008000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x80008000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n0_v2f16:
@@ -1665,10 +1665,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_n0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1680,21 +1680,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_p1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3c003c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_p1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3c003c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_p1_v2f16:
@@ -1709,10 +1709,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_p1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c003c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1724,21 +1724,21 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_n1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0xbc00bc00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_n1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0xbc00bc00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_n1_v2f16:
@@ -1753,10 +1753,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: test_fold_canonicalize_n1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbc00bc00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1768,21 +1768,21 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(ptr addrspace(1) %out
define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_literal_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4c004c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_literal_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4c004c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_literal_v2f16:
@@ -1797,10 +1797,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
;
; GFX11-LABEL: test_fold_canonicalize_literal_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4c004c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1812,21 +1812,21 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(ptr addrspace(1)
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
@@ -1841,10 +1841,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1856,21 +1856,21 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(p
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x3ff03ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3ff03ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
@@ -1885,10 +1885,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1900,21 +1900,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(ptr
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
@@ -1929,10 +1929,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
;
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1944,21 +1944,21 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(p
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr addrspace(1) %out) #3 {
; VI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x83ff83ff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x83ff83ff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
@@ -1973,10 +1973,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
;
; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1988,21 +1988,21 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(ptr
define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7c007c00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c007c00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_v2f16:
@@ -2017,10 +2017,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
;
; GFX11-LABEL: test_fold_canonicalize_qnan_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2032,21 +2032,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(ptr addrspace(1) %o
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
@@ -2061,10 +2061,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2076,21 +2076,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(ptr addr
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
@@ -2105,10 +2105,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
;
; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2120,21 +2120,21 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(ptr addr
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan0_value_v2f16:
@@ -2149,10 +2149,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan0_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2164,21 +2164,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan1_value_v2f16:
@@ -2193,10 +2193,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan1_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2208,21 +2208,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan2_value_v2f16:
@@ -2237,10 +2237,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan2_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2252,21 +2252,21 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(ptr addrspac
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x7e007e00
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: test_fold_canonicalize_snan3_value_v2f16:
@@ -2281,10 +2281,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(ptr addrspac
;
; GFX11-LABEL: test_fold_canonicalize_snan3_value_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7e007e00
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2376,20 +2376,20 @@ define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: s_test_canonicalize_undef_v2f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_undef_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: s_test_canonicalize_undef_v2f16:
@@ -2404,10 +2404,10 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(ptr addrspace(1) %out
;
; GFX11-LABEL: s_test_canonicalize_undef_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2678,22 +2678,22 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out) #1 {
; VI-LABEL: s_test_canonicalize_undef_v4f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_test_canonicalize_undef_v4f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-NEXT: s_endpgm
;
; CI-LABEL: s_test_canonicalize_undef_v4f16:
@@ -2709,12 +2709,12 @@ define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(ptr addrspace(1) %out
;
; GFX11-LABEL: s_test_canonicalize_undef_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
index 7d8f43bbe16b7..038aad313a901 100644
--- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -58,25 +58,25 @@ define amdgpu_kernel void @fcmp_f16_lt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -147,26 +147,26 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e64 s0, |v0|, |v1|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, s0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -239,25 +239,25 @@ define amdgpu_kernel void @fcmp_f16_eq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -328,25 +328,25 @@ define amdgpu_kernel void @fcmp_f16_le(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -417,25 +417,25 @@ define amdgpu_kernel void @fcmp_f16_gt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -506,25 +506,25 @@ define amdgpu_kernel void @fcmp_f16_lg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -595,25 +595,25 @@ define amdgpu_kernel void @fcmp_f16_ge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -684,25 +684,25 @@ define amdgpu_kernel void @fcmp_f16_o(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -773,25 +773,25 @@ define amdgpu_kernel void @fcmp_f16_u(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -862,25 +862,25 @@ define amdgpu_kernel void @fcmp_f16_nge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -951,25 +951,25 @@ define amdgpu_kernel void @fcmp_f16_nlg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1040,25 +1040,25 @@ define amdgpu_kernel void @fcmp_f16_ngt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1129,25 +1129,25 @@ define amdgpu_kernel void @fcmp_f16_nle(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1218,25 +1218,25 @@ define amdgpu_kernel void @fcmp_f16_neq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1307,25 +1307,25 @@ define amdgpu_kernel void @fcmp_f16_nlt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1406,20 +1406,20 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1429,7 +1429,7 @@ define amdgpu_kernel void @fcmp_v2f16_lt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1511,20 +1511,20 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1534,7 +1534,7 @@ define amdgpu_kernel void @fcmp_v2f16_eq(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_eq_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1615,20 +1615,20 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1638,7 +1638,7 @@ define amdgpu_kernel void @fcmp_v2f16_le(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_le_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1719,20 +1719,20 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1742,7 +1742,7 @@ define amdgpu_kernel void @fcmp_v2f16_gt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1824,20 +1824,20 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1847,7 +1847,7 @@ define amdgpu_kernel void @fcmp_v2f16_lg(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_lg_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1929,20 +1929,20 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1952,7 +1952,7 @@ define amdgpu_kernel void @fcmp_v2f16_ge(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ge_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2034,20 +2034,20 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2057,7 +2057,7 @@ define amdgpu_kernel void @fcmp_v2f16_o(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2139,20 +2139,20 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2162,7 +2162,7 @@ define amdgpu_kernel void @fcmp_v2f16_u(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2243,20 +2243,20 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2266,7 +2266,7 @@ define amdgpu_kernel void @fcmp_v2f16_nge(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nge_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2347,20 +2347,20 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2370,7 +2370,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlg(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlg_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2452,20 +2452,20 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2475,7 +2475,7 @@ define amdgpu_kernel void @fcmp_v2f16_ngt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2556,20 +2556,20 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2579,7 +2579,7 @@ define amdgpu_kernel void @fcmp_v2f16_nle(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nle_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2660,20 +2660,20 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2683,7 +2683,7 @@ define amdgpu_kernel void @fcmp_v2f16_neq(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_neq_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2764,20 +2764,20 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2787,7 +2787,7 @@ define amdgpu_kernel void @fcmp_v2f16_nlt(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v3, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index fd80580c2fb27..b2fadbdceebf5 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -31,16 +31,16 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
;
; VI-LABEL: s_copysign_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s3, 0x7fff
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_bfi_b32 v2, s3, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -61,15 +61,15 @@ define amdgpu_kernel void @s_copysign_f16(ptr addrspace(1) %arg_out, half %mag,
; GFX11-LABEL: s_copysign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v0, s3
-; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s2, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -93,13 +93,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
;
; VI-LABEL: s_test_copysign_f16_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -117,13 +117,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
; GFX11-LABEL: s_test_copysign_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -147,13 +147,13 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
;
; VI-LABEL: s_test_copysign_f16_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -171,13 +171,13 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
; GFX11-LABEL: s_test_copysign_f16_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -201,13 +201,13 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_10.0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x7fff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -225,13 +225,13 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x7fff
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -255,13 +255,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 15
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -279,13 +279,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_or_b32 s0, s4, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,13 +309,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_neg10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 15
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -333,13 +333,13 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 15
+; GFX11-NEXT: s_or_b32 s0, s4, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -365,13 +365,13 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_and_b32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -389,12 +389,12 @@ define amdgpu_kernel void @s_test_copysign_f16_0_mag(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_and_b32_e64 v1, 0xffff8000, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -421,14 +421,14 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
;
; VI-LABEL: s_test_copysign_f16_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -447,14 +447,14 @@ define amdgpu_kernel void @s_test_copysign_f16_1_mag(ptr addrspace(1) %out, half
; GFX11-LABEL: s_test_copysign_f16_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -481,14 +481,14 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
;
; VI-LABEL: s_test_copysign_f16_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -507,14 +507,14 @@ define amdgpu_kernel void @s_test_copysign_f16_10_mag(ptr addrspace(1) %out, hal
; GFX11-LABEL: s_test_copysign_f16_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -540,14 +540,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
;
; VI-LABEL: s_test_copysign_f16_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x3c00, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -566,14 +566,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1_mag(ptr addrspace(1) %out, h
; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -600,14 +600,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f16_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xffff8000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_and_b32_e32 v0, s2, v0
+; VI-NEXT: v_and_b32_e32 v0, s4, v0
; VI-NEXT: v_or_b32_e32 v2, 0x4900, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -626,14 +626,14 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10_mag(ptr addrspace(1) %out,
; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s2
+; GFX11-NEXT: v_and_b32_e64 v0, 0xffff8000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_or_b32_e32 v0, 0x4900, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -850,19 +850,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f16_sign_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -950,19 +950,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_ushort v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -993,13 +993,14 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f16_sign_f64(ptr addrspace(1)
;
; GFX11-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v1, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
@@ -1050,19 +1051,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1093,19 +1094,19 @@ define amdgpu_kernel void @v_copysign_out_f32_mag_f32_sign_f16(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[4:5]
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1151,19 +1152,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1194,19 +1195,19 @@ define amdgpu_kernel void @v_copysign_out_f64_mag_f64_sign_f16(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v1, s[4:5]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v2, v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1252,19 +1253,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v2, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1295,19 +1296,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f32(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v1, s[4:5]
-; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v0, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1353,18 +1354,18 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1393,19 +1394,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1453,19 +1454,19 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dword v2, v[1:2]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1497,12 +1498,12 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f32_sign_f16(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[6:7]
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -1904,29 +1905,29 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; VI-LABEL: s_copysign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x7fff
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_lshr_b32 s3, s6, 16
+; VI-NEXT: s_lshr_b32 s1, s6, 16
; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
-; VI-NEXT: s_add_u32 s2, s0, 4
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: v_bfi_b32 v3, s0, v0, v1
+; VI-NEXT: s_add_u32 s0, s2, 4
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: flat_store_short v[0:1], v3
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1958,24 +1959,24 @@ define amdgpu_kernel void @s_copysign_v3f16(ptr addrspace(1) %arg_out, <3 x half
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s2, s6, 16
+; GFX11-NEXT: s_lshr_b32 s0, s6, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_lshr_b32 s2, s4, 16
+; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: v_mov_b32_e32 v2, s7
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s4, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s2, v1
+; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s0, v1
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s5, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v3, v2, s[0:1] offset:4
-; GFX11-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v2, s[2:3] offset:4
+; GFX11-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2023,31 +2024,31 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; VI-LABEL: s_copysign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x7fff
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_lshr_b32 s3, s7, 16
+; VI-NEXT: s_lshr_b32 s1, s7, 16
; VI-NEXT: s_lshr_b32 s5, s5, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_bfi_b32 v1, s2, v1, v2
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_lshr_b32 s3, s6, 16
+; VI-NEXT: s_lshr_b32 s1, s6, 16
; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v2
+; VI-NEXT: v_bfi_b32 v0, s0, v0, v2
; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_bfi_b32 v2, s2, v2, v3
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2085,26 +2086,26 @@ define amdgpu_kernel void @s_copysign_v4f16(ptr addrspace(1) %arg_out, <4 x half
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v0, s7
; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: s_lshr_b32 s2, s7, 16
+; GFX11-NEXT: s_lshr_b32 s0, s7, 16
; GFX11-NEXT: s_lshr_b32 s6, s6, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s6
+; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s6
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, s5, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, s4, v1
-; GFX11-NEXT: s_lshr_b32 s3, s5, 16
-; GFX11-NEXT: s_lshr_b32 s2, s4, 16
-; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s3, v2
-; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s2, v3
+; GFX11-NEXT: s_lshr_b32 s1, s5, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
+; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, s1, v2
+; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, s0, v3
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_lshl_or_b32 v1, v2, 16, v0
; GFX11-NEXT: v_lshl_or_b32 v0, v3, 16, v4
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index fb04b66e1a6ad..3f5d90ece2f14 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -63,26 +63,26 @@ define amdgpu_kernel void @s_test_copysign_f32_0(ptr addrspace(1) %out, float %m
;
; VI-LABEL: s_test_copysign_f32_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -106,26 +106,26 @@ define amdgpu_kernel void @s_test_copysign_f32_1(ptr addrspace(1) %out, float %m
;
; VI-LABEL: s_test_copysign_f32_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -149,26 +149,26 @@ define amdgpu_kernel void @s_test_copysign_f32_10.0(ptr addrspace(1) %out, float
;
; VI-LABEL: s_test_copysign_f32_10.0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s2, 31
+; GFX11-NEXT: s_and_b32 s0, s4, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -192,26 +192,26 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1(ptr addrspace(1) %out, float
;
; VI-LABEL: s_test_copysign_f32_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_or_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -235,26 +235,26 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_neg10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s2, 31
+; GFX11-NEXT: s_or_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -278,26 +278,26 @@ define amdgpu_kernel void @s_test_copysign_f32_0_mag(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -323,28 +323,28 @@ define amdgpu_kernel void @s_test_copysign_f32_1_mag(ptr addrspace(1) %out, floa
;
; VI-LABEL: s_test_copysign_f32_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,28 +369,28 @@ define amdgpu_kernel void @s_test_copysign_f32_10_mag(ptr addrspace(1) %out, flo
;
; VI-LABEL: s_test_copysign_f32_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 0x41200000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 0x41200000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -415,28 +415,28 @@ define amdgpu_kernel void @s_test_copysign_f32_neg1_mag(ptr addrspace(1) %out, f
;
; VI-LABEL: s_test_copysign_f32_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -461,28 +461,28 @@ define amdgpu_kernel void @s_test_copysign_f32_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f32_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 0x41200000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 0x41200000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x41200000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x41200000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -512,17 +512,17 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo
; VI-LABEL: s_test_copysign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_bfi_b32 v0, s2, v2, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_bfi_b32 v0, s0, v2, v0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -530,14 +530,14 @@ define amdgpu_kernel void @s_test_copysign_v2f32(ptr addrspace(1) %out, <2 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
; GFX11-NEXT: v_mov_b32_e32 v2, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -571,20 +571,20 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; VI-LABEL: s_test_copysign_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s7, -2
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s10
; VI-NEXT: v_mov_b32_e32 v3, s5
-; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v2, s7, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_bfi_b32 v1, s2, v3, v0
+; VI-NEXT: v_bfi_b32 v1, s7, v3, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v3, s8
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v3
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_bfi_b32 v0, s7, v0, v3
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -592,7 +592,7 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s9
@@ -602,7 +602,7 @@ define amdgpu_kernel void @s_test_copysign_v3f32(ptr addrspace(1) %out, <3 x flo
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v3
-; GFX11-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX11-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -638,23 +638,23 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; VI-LABEL: s_test_copysign_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s12, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: v_bfi_b32 v2, s2, v2, v0
+; VI-NEXT: v_bfi_b32 v2, s12, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_bfi_b32 v1, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v1, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_bfi_b32 v0, s2, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_bfi_b32 v0, s12, v0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -662,7 +662,7 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s11 :: v_dual_mov_b32 v1, s10
@@ -673,7 +673,7 @@ define amdgpu_kernel void @s_test_copysign_v4f32(ptr addrspace(1) %out, <4 x flo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v4
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s4, v5
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -921,16 +921,16 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
;
; VI-LABEL: s_test_copysign_f32_fptrunc_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s2, -2
-; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_bfi_b32 v2, s2, v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_bfi_b32 v2, s0, v0, v1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -940,12 +940,12 @@ define amdgpu_kernel void @s_test_copysign_f32_fptrunc_f64(ptr addrspace(1) %out
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -972,25 +972,25 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fptrunc_f64(ptr addrspace(1) %o
;
; VI-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 1.0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_fptrunc_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1063,31 +1063,31 @@ define amdgpu_kernel void @s_test_copysign_f32_1_fpext_f16(ptr addrspace(1) %out
;
; VI-LABEL: s_test_copysign_f32_1_fpext_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s2, s2, 0x80000000
-; VI-NEXT: s_or_b32 s2, s2, 1.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 16
+; VI-NEXT: s_and_b32 s0, s0, 0x80000000
+; VI-NEXT: s_or_b32 s0, s0, 1.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f32_1_fpext_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s2, s2, 16
+; GFX11-NEXT: s_lshl_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s2, 0x80000000
-; GFX11-NEXT: s_or_b32 s2, s2, 1.0
+; GFX11-NEXT: s_and_b32 s0, s0, 0x80000000
+; GFX11-NEXT: s_or_b32 s0, s0, 1.0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index b5fa3fd9eccc1..5d5a4e9564cda 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -29,15 +29,15 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s4, v0, v1
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -46,14 +46,14 @@ define amdgpu_kernel void @s_test_copysign_f64(ptr addrspace(1) %out, [8 x i32],
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[6:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -79,13 +79,13 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: s_test_copysign_f64_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -93,13 +93,13 @@ define amdgpu_kernel void @s_test_copysign_f64_0(ptr addrspace(1) %out, [8 x i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -125,13 +125,13 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: s_test_copysign_f64_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -139,13 +139,13 @@ define amdgpu_kernel void @s_test_copysign_f64_1(ptr addrspace(1) %out, [8 x i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -171,13 +171,13 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: s_test_copysign_f64_10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset0_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_and_b32 s0, s3, 0x7fffffff
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -185,13 +185,13 @@ define amdgpu_kernel void @s_test_copysign_f64_10(ptr addrspace(1) %out, [8 x i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset0_b32 s3, 31
+; GFX11-NEXT: s_and_b32 s0, s3, 0x7fffffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -217,13 +217,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
; VI-LABEL: s_test_copysign_f64_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -231,13 +231,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1(ptr addrspace(1) %out, [8 x
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_or_b32 s0, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -263,13 +263,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
; VI-LABEL: s_test_copysign_f64_neg10:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -277,13 +277,13 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10(ptr addrspace(1) %out, [8 x
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_bitset1_b32 s3, 31
+; GFX11-NEXT: s_or_b32 s0, s3, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -312,32 +312,32 @@ define amdgpu_kernel void @s_test_copysign_f64_f32(ptr addrspace(1) %out, [8 x i
; VI-LABEL: s_test_copysign_f64_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dword s4, s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s5, -2
+; VI-NEXT: s_load_dword s6, s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s5, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s6
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_bfi_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -366,33 +366,33 @@ define amdgpu_kernel void @s_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i
;
; VI-LABEL: s_test_copysign_f64_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x74
+; VI-NEXT: s_load_dword s6, s[0:1], 0x74
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s5, -2
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4
+; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_bfi_b32 v1, s5, v1, v0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_bfi_b32 v1, s0, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x74
+; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x74
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s4
+; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -419,24 +419,24 @@ define amdgpu_kernel void @s_test_copysign_f64_0_mag(ptr addrspace(1) %out, doub
;
; VI-LABEL: s_test_copysign_f64_0_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_0_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -463,26 +463,26 @@ define amdgpu_kernel void @s_test_copysign_f64_1_mag(ptr addrspace(1) %out, doub
;
; VI-LABEL: s_test_copysign_f64_1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_1_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -509,26 +509,26 @@ define amdgpu_kernel void @s_test_copysign_f64_10_mag(ptr addrspace(1) %out, dou
;
; VI-LABEL: s_test_copysign_f64_10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_10_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -555,26 +555,26 @@ define amdgpu_kernel void @s_test_copysign_f64_neg1_mag(ptr addrspace(1) %out, d
;
; VI-LABEL: s_test_copysign_f64_neg1_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x3ff00000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg1_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x3ff00000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x3ff00000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -601,26 +601,26 @@ define amdgpu_kernel void @s_test_copysign_f64_neg10_mag(ptr addrspace(1) %out,
;
; VI-LABEL: s_test_copysign_f64_neg10_mag:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_and_b32 s0, s3, 0x80000000
+; VI-NEXT: s_and_b32 s0, s7, 0x80000000
; VI-NEXT: s_or_b32 s0, s0, 0x40240000
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_copysign_f64_neg10_mag:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s3, 0x80000000
+; GFX11-NEXT: s_and_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_or_b32 s2, s2, 0x40240000
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-NEXT: s_or_b32 s0, s0, 0x40240000
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b64 v0, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -652,19 +652,19 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; VI-LABEL: s_test_copysign_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s8, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s11
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s8, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_bfi_b32 v1, s8, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -672,7 +672,7 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s11
; GFX11-NEXT: v_mov_b32_e32 v2, s9
@@ -681,7 +681,7 @@ define amdgpu_kernel void @s_test_copysign_v2f64(ptr addrspace(1) %out, <2 x dou
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v2
; GFX11-NEXT: v_mov_b32_e32 v2, s6
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -718,28 +718,28 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; VI-LABEL: s_test_copysign_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s10, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s15
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s10, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_bfi_b32 v1, s10, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v7, s1
+; VI-NEXT: v_bfi_b32 v5, s10, v0, v2
; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v6, s2
+; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -747,7 +747,7 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT: v_dual_mov_b32 v5, s17 :: v_dual_mov_b32 v0, s4
@@ -758,8 +758,8 @@ define amdgpu_kernel void @s_test_copysign_v3f64(ptr addrspace(1) %out, <3 x dou
; GFX11-NEXT: v_bfi_b32 v3, 0x7fffffff, s7, v1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, s5, v7
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b64 v6, v[4:5], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -801,32 +801,32 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; VI-LABEL: s_test_copysign_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_brev_b32 s2, -2
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_brev_b32 s12, -2
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s15
; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_bfi_b32 v3, s2, v0, v1
+; VI-NEXT: v_bfi_b32 v3, s12, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s13
-; VI-NEXT: v_bfi_b32 v1, s2, v2, v0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_bfi_b32 v1, s12, v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v2, s19
-; VI-NEXT: v_bfi_b32 v7, s2, v0, v2
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_bfi_b32 v7, s12, v0, v2
; VI-NEXT: v_mov_b32_e32 v0, s9
; VI-NEXT: v_mov_b32_e32 v2, s17
-; VI-NEXT: v_bfi_b32 v5, s2, v0, v2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v9, s1
+; VI-NEXT: v_bfi_b32 v5, s12, v0, v2
; VI-NEXT: v_mov_b32_e32 v4, s8
; VI-NEXT: v_mov_b32_e32 v6, s10
-; VI-NEXT: v_mov_b32_e32 v8, s2
+; VI-NEXT: v_mov_b32_e32 v8, s0
; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -834,7 +834,7 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s15
; GFX11-NEXT: v_dual_mov_b32 v3, s19 :: v_dual_mov_b32 v2, s10
@@ -848,8 +848,8 @@ define amdgpu_kernel void @s_test_copysign_v4f64(ptr addrspace(1) %out, <4 x dou
; GFX11-NEXT: v_mov_b32_e32 v6, s6
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, s5, v5
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index b14b6421f56b4..cfb608c87b7ff 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -47,14 +47,14 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX8-LABEL: v_fdiv_f16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -111,12 +111,12 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -178,52 +178,52 @@ define amdgpu_kernel void @v_rcp_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX8-LABEL: v_rcp_f16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -272,52 +272,52 @@ define amdgpu_kernel void @v_rcp_f16_abs(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_abs:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e64 v3, |v0|
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_abs:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_abs:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_abs:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, |v1|
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,52 +369,52 @@ define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrs
;
; GFX8-LABEL: reciprocal_f16_rounded:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: reciprocal_f16_rounded:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: reciprocal_f16_rounded:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: reciprocal_f16_rounded:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -450,52 +450,52 @@ define amdgpu_kernel void @v_rcp_f16_afn(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_afn:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_afn:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_afn:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_afn:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -544,52 +544,52 @@ define amdgpu_kernel void @v_rcp_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rcp_f16_neg:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rcp_f16_e64 v3, -v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rcp_f16_neg:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rcp_f16_neg:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rcp_f16_neg:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -641,52 +641,52 @@ define amdgpu_kernel void @v_rsq_f16(ptr addrspace(1) %r, ptr addrspace(1) %b) #
;
; GFX8-LABEL: v_rsq_f16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rsq_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -739,17 +739,17 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX8-LABEL: v_rsq_f16_neg:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_rsq_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_xor_b32_e32 v2, 0x8000, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
@@ -757,39 +757,39 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
;
; GFX9-LABEL: v_rsq_f16_neg:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v1, v1
; GFX9-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_neg:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v1, v1
; GFX10-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_neg:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -844,16 +844,16 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
;
; GFX8-LABEL: v_rsq_f16_multi_use:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v3, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_rsq_f16_e32 v4, v3
; GFX8-NEXT: flat_store_short v[0:1], v3
@@ -863,41 +863,41 @@ define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspac
;
; GFX9-LABEL: v_rsq_f16_multi_use:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rsq_f16_e32 v2, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_multi_use:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_rsq_f16_e32 v2, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_multi_use:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rsq_f16_e32 v2, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v2, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -951,57 +951,57 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr
;
; GFX8-LABEL: v_rsq_f16_missing_contract0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16_missing_contract0:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract0:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract0:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1054,57 +1054,57 @@ define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr
;
; GFX8-LABEL: v_rsq_f16_missing_contract1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e32 v3, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1157,57 +1157,57 @@ define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r,
;
; GFX8-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
; GFX8-NEXT: v_rcp_f16_e64 v3, -v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_store_short v[0:1], v3
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1251,14 +1251,14 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX8-LABEL: v_fdiv_f16_afn:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1307,12 +1307,12 @@ define amdgpu_kernel void @v_fdiv_f16_afn(ptr addrspace(1) %r, ptr addrspace(1)
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1362,14 +1362,14 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX8-LABEL: v_fdiv_f16_unsafe:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_ushort v5, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1418,12 +1418,12 @@ define amdgpu_kernel void @v_fdiv_f16_unsafe(ptr addrspace(1) %r, ptr addrspace(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1463,46 +1463,46 @@ define amdgpu_kernel void @div_afn_2_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_2_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0.5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_2_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_2_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_2_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0.5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1530,46 +1530,46 @@ define amdgpu_kernel void @div_afn_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_k_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0x2e66, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_k_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_k_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0x2e66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1597,46 +1597,46 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
; GFX8-LABEL: div_afn_neg_k_x_pat_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v2, 0xae66, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: div_afn_neg_k_x_pat_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_short v1, v0, s[0:1]
+; GFX9-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: div_afn_neg_k_x_pat_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_short v1, v0, s[0:1]
+; GFX10-NEXT: global_store_short v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: div_afn_neg_k_x_pat_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: global_load_u16 v0, v[0:1], off
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, 0xae66, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index b63976876c41c..92db799af373a 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -1077,7 +1077,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-LABEL: s_fdiv_v2f32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_div_scale_f32 v1, s[2:3], s7, s7, v0
@@ -1097,6 +1096,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_div_fmas_f32 v1, v1, v3, v5
; GFX8-NEXT: v_mov_b32_e32 v3, s6
; GFX8-NEXT: v_div_scale_f32 v3, vcc, s4, v3, s4
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v5, v2
; GFX8-NEXT: v_div_fixup_f32 v1, v1, s7, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -1108,8 +1108,9 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX8-NEXT: v_fma_f32 v2, -v2, v5, v3
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v2, v0, v5
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_div_fixup_f32 v0, v0, s6, v4
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
@@ -1120,7 +1121,6 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_div_scale_f32 v0, s2, s7, s7, s5
; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -1132,6 +1132,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_scale_f32 v2, s2, s6, s6, s4
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_div_fixup_f32 v1, v0, s7, s5
; GFX10-NEXT: v_div_scale_f32 v0, vcc_lo, s4, s6, s4
@@ -1147,14 +1148,14 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v3, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v0, s6, s4
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_v2f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v0, null, s7, s7, s5
; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, s5, s7, s5
@@ -1185,7 +1186,7 @@ define amdgpu_kernel void @s_fdiv_v2f32(ptr addrspace(1) %out, <2 x float> %a, <
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v3, v4
; GFX11-NEXT: v_div_fixup_f32 v0, v0, s6, s4
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1227,12 +1228,12 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
; GFX8-LABEL: s_fdiv_ulp25_v2f32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s6
; GFX8-NEXT: v_rcp_f32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v1
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -1256,14 +1257,14 @@ define amdgpu_kernel void @s_fdiv_ulp25_v2f32(ptr addrspace(1) %out, <2 x float>
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s6
; GFX11-NEXT: v_rcp_f32_e32 v1, s7
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1305,14 +1306,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
; GFX8-LABEL: s_fdiv_v2f32_fast_math:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s7
; GFX8-NEXT: v_rcp_f32_e32 v2, s6
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1334,14 +1335,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_fast_math(ptr addrspace(1) %out, <2 x fl
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1383,14 +1384,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
; GFX8-LABEL: s_fdiv_v2f32_arcp_math:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_rcp_f32_e32 v0, s7
; GFX8-NEXT: v_rcp_f32_e32 v2, s6
; GFX8-NEXT: v_mul_f32_e32 v1, s5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -1412,14 +1413,14 @@ define amdgpu_kernel void @s_fdiv_v2f32_arcp_math(ptr addrspace(1) %out, <2 x fl
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v0, s7
; GFX11-NEXT: v_rcp_f32_e32 v2, s6
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_dual_mul_f32 v1, s5, v0 :: v_dual_mul_f32 v0, s4, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2179,10 +2180,10 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX8-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v2, v0
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -2194,18 +2195,19 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s4, 1.0
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: s_denorm_mode 15
; GFX10-NEXT: v_fma_f32 v3, -v0, v1, 1.0
@@ -2217,19 +2219,19 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX10-NEXT: s_denorm_mode 12
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_correctly_rounded_divide_sqrt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_denorm_mode 15
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -2242,8 +2244,8 @@ define amdgpu_kernel void @s_fdiv_f32_correctly_rounded_divide_sqrt(ptr addrspac
; GFX11-NEXT: s_denorm_mode 12
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2331,10 +2333,10 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
; GFX8-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
; GFX8-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_rcp_f32_e32 v2, v0
; GFX8-NEXT: v_fma_f32 v3, -v0, v2, 1.0
; GFX8-NEXT: v_fma_f32 v2, v3, v2, v2
@@ -2344,52 +2346,53 @@ define amdgpu_kernel void @s_fdiv_f32_denorms_correctly_rounded_divide_sqrt(ptr
; GFX8-NEXT: v_fma_f32 v0, -v0, v3, v1
; GFX8-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GFX8-NEXT: v_div_fixup_f32 v2, v0, s4, 1.0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v0, s3, s2, s2, 1.0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: v_div_scale_f32 v0, s2, s4, s4, 1.0
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_rcp_f32_e32 v1, v0
; GFX10-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v1
-; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX10-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX10-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX10-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX10-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
+; GFX10-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_fdiv_f32_denorms_correctly_rounded_divide_sqrt:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v0, null, s2, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v0, null, s4, s4, 1.0
; GFX11-NEXT: v_rcp_f32_e32 v1, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_fma_f32 v2, -v0, v1, 1.0
; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v1
-; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s2, 1.0
+; GFX11-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, s4, 1.0
; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
; GFX11-NEXT: v_fma_f32 v4, -v0, v3, v2
; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v1
; GFX11-NEXT: v_fma_f32 v0, -v0, v3, v2
; GFX11-NEXT: v_div_fmas_f32 v0, v0, v1, v3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-NEXT: v_div_fixup_f32 v0, v0, s2, 1.0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
index c56b4ae3c34f5..fede468e8219a 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
@@ -5,28 +5,28 @@
define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float 1.000000e+00, %load, !fpmath !0
@@ -37,28 +37,28 @@ define amdgpu_kernel void @div_1_by_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float -1.000000e+00, %load, !fpmath !0
@@ -69,28 +69,28 @@ define amdgpu_kernel void @div_minus_1_by_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e64 v0, -s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fneg float %load
@@ -102,28 +102,28 @@ define amdgpu_kernel void @div_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s2
+; GCN-DENORM-NEXT: v_frexp_mant_f32_e32 v0, s0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s2
+; GCN-DENORM-NEXT: v_frexp_exp_i32_f32_e32 v2, s0
; GCN-DENORM-NEXT: v_sub_u32_e32 v2, 0, v2
; GCN-DENORM-NEXT: v_ldexp_f32 v0, v0, v2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_25ulp:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -512,13 +512,13 @@ define amdgpu_kernel void @div_v_by_x_25ulp(ptr addrspace(1) %arg, float %num) {
define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_x_fast:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv fast float 1.000000e+00, %load, !fpmath !0
@@ -529,25 +529,25 @@ define amdgpu_kernel void @div_1_by_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_fast:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_fast:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, s0
; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv fast float -1.000000e+00, %load, !fpmath !0
@@ -558,13 +558,13 @@ define amdgpu_kernel void @div_minus_1_by_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-LABEL: div_1_by_minus_x_fast:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e64 v0, -s2
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: v_rcp_f32_e64 v0, -s0
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fneg float %load, !fpmath !0
@@ -576,25 +576,25 @@ define amdgpu_kernel void @div_1_by_minus_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_fast:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, s2
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: v_rcp_f32_e32 v0, s0
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_fast:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s2, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s0, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s2
+; GCN-FLUSH-NEXT: v_rcp_f32_e64 v0, -s0
; GCN-FLUSH-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load, !fpmath !0
@@ -606,11 +606,11 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_fast(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -622,16 +622,16 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -645,7 +645,7 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float 1.000000e+00, %load
@@ -656,11 +656,11 @@ define amdgpu_kernel void @div_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -672,16 +672,16 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -695,7 +695,7 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%div = fdiv float -1.000000e+00, %load
@@ -706,11 +706,11 @@ define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(ptr addrspace(1) %
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, -1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, -1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, -1.0, s4, -1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -722,16 +722,16 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, -1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_1_by_minus_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], -s4, -s4, 1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], -s4, -s4, 1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, 1.0, -s4, 1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -745,7 +745,7 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, -s4, 1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
@@ -757,11 +757,11 @@ define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(ptr addrspace(1) %
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspace(1) %arg) {
; GCN-DENORM-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM: ; %bb.0:
-; GCN-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-DENORM-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[2:3], s4, s4, 1.0
+; GCN-DENORM-NEXT: v_div_scale_f32 v0, s[0:1], s4, s4, 1.0
; GCN-DENORM-NEXT: v_div_scale_f32 v1, vcc, 1.0, s4, 1.0
; GCN-DENORM-NEXT: v_rcp_f32_e32 v2, v0
; GCN-DENORM-NEXT: v_fma_f32 v3, -v0, v2, 1.0
@@ -773,16 +773,16 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac
; GCN-DENORM-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-DENORM-NEXT: v_mov_b32_e32 v1, 0
; GCN-DENORM-NEXT: v_div_fixup_f32 v0, v0, s4, 1.0
-; GCN-DENORM-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-DENORM-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-DENORM-NEXT: s_endpgm
;
; GCN-FLUSH-LABEL: div_minus_1_by_minus_x_correctly_rounded:
; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: s_load_dword s4, s[0:1], 0x0
+; GCN-FLUSH-NEXT: s_load_dword s4, s[2:3], 0x0
; GCN-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[2:3], -s4, -s4, -1.0
+; GCN-FLUSH-NEXT: v_div_scale_f32 v0, s[0:1], -s4, -s4, -1.0
; GCN-FLUSH-NEXT: v_div_scale_f32 v1, vcc, -1.0, -s4, -1.0
; GCN-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
; GCN-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
@@ -796,7 +796,7 @@ define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(ptr addrspac
; GCN-FLUSH-NEXT: v_div_fmas_f32 v0, v0, v2, v3
; GCN-FLUSH-NEXT: v_mov_b32_e32 v1, 0
; GCN-FLUSH-NEXT: v_div_fixup_f32 v0, v0, -s4, -1.0
-; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-FLUSH-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-FLUSH-NEXT: s_endpgm
%load = load float, ptr addrspace(1) %arg, align 4
%neg = fsub float -0.000000e+00, %load
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index ab3650f5650ac..e0abaa62cfc5a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -4354,14 +4354,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i3
; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 16
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4606,12 +4606,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in,
; GCN2-LABEL: atomic_cmpxchg_i32_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5263,31 +5263,31 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i32_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5314,29 +5314,29 @@ define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i32:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i32:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5368,12 +5368,12 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5381,8 +5381,8 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5432,19 +5432,19 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_i32_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5673,31 +5673,31 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f32_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5724,29 +5724,29 @@ define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f32:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f32:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dword v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -5778,12 +5778,12 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_f32_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5791,8 +5791,8 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5842,19 +5842,19 @@ define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_f32_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -6083,31 +6083,31 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i8_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6134,29 +6134,29 @@ define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i8:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_byte v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i8:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_byte v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6188,10 +6188,10 @@ define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %
; GCN2-LABEL: atomic_load_i8_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s4, s0
-; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s4, s2
+; GCN2-NEXT: s_addc_u32 s1, s5, s3
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6378,31 +6378,31 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i16_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6429,29 +6429,29 @@ define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i16:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_i16:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
entry:
@@ -6483,12 +6483,12 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i16_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6496,8 +6496,8 @@ define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -7963,31 +7963,31 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f16_offset:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16_offset:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr half, ptr %in, i64 8
@@ -8013,29 +8013,29 @@ define amdgpu_kernel void @atomic_load_f16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f16:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_f16:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic half, ptr %in seq_cst, align 2
@@ -8062,31 +8062,31 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_bf16_offset:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 16
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 16
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16_offset:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr %in, i64 8
@@ -8112,29 +8112,29 @@ define amdgpu_kernel void @atomic_load_bf16(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_bf16:
; GCN2: ; %bb.0:
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_short v[0:1], v2
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_load_bf16:
; GCN3: ; %bb.0:
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_short v[0:1], v2
; GCN3-NEXT: s_endpgm
%val = load atomic bfloat, ptr %in seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 816142dd17cce..1d204ac2ce700 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -3953,14 +3953,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -3971,7 +3971,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3981,8 +3981,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -4152,14 +4152,14 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
;
; GCN2-LABEL: atomic_max_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -4168,7 +4168,7 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4178,8 +4178,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5096,14 +5096,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5114,7 +5114,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_u32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5124,8 +5124,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -5205,14 +5205,14 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
;
; GCN2-LABEL: atomic_umax_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -5221,7 +5221,7 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_max_u32_e32 v2, s4, v3
+; GCN2-NEXT: v_max_u32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5231,8 +5231,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -6890,14 +6890,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 16
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -6908,7 +6908,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_min_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6918,8 +6918,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
@@ -7076,14 +7076,14 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
;
; GCN2-LABEL: atomic_min_i32_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_ashr_i32 s7, s5, 31
-; GCN2-NEXT: s_mov_b32 s6, s5
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_ashr_i32 s1, s3, 31
+; GCN2-NEXT: s_mov_b32 s0, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dword v2, v[0:1]
@@ -7092,7 +7092,7 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v3, v2
-; GCN2-NEXT: v_min_i32_e32 v2, s4, v3
+; GCN2-NEXT: v_min_i32_e32 v2, s2, v3
; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -7102,8 +7102,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %i
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dword v[0:1], v2
; GCN2-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index b8c8d993d389b..fa5a0dbf3409c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -21,13 +21,13 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_add_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -36,10 +36,10 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -72,20 +72,20 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_add_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -93,10 +93,10 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -133,10 +133,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_add_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -152,12 +152,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_add_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32
@@ -195,38 +195,38 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_add_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -253,12 +253,12 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_add_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -266,10 +266,10 @@ define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -300,12 +300,12 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_add_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -318,10 +318,10 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -355,10 +355,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_add_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -372,12 +372,12 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_add_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1]
@@ -412,36 +412,36 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_add_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -469,13 +469,13 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_and_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -484,10 +484,10 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -520,20 +520,20 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_and_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -541,10 +541,10 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -581,10 +581,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_and_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -600,12 +600,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_and_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32
@@ -643,38 +643,38 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_and_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -701,12 +701,12 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_and_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -714,10 +714,10 @@ define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -748,12 +748,12 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_and_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -766,10 +766,10 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -803,10 +803,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_and_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -820,12 +820,12 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_and_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1]
@@ -860,36 +860,36 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_and_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -917,13 +917,13 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_sub_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -932,10 +932,10 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -968,20 +968,20 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_sub_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -989,10 +989,10 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1029,10 +1029,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_sub_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1048,12 +1048,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_sub_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32
@@ -1091,38 +1091,38 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1149,12 +1149,12 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_sub_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -1162,10 +1162,10 @@ define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1196,12 +1196,12 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_sub_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -1214,10 +1214,10 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -1251,10 +1251,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_sub_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -1268,12 +1268,12 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_sub_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1]
@@ -1308,36 +1308,36 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_sub_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1364,13 +1364,13 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_max_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -1378,10 +1378,10 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1414,19 +1414,19 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -1435,10 +1435,10 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1474,10 +1474,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_max_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1492,12 +1492,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_max_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32
@@ -1535,38 +1535,38 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1592,22 +1592,22 @@ define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_max_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1638,12 +1638,12 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_max_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -1656,10 +1656,10 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1692,10 +1692,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_max_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -1708,12 +1708,12 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_max_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1]
@@ -1748,36 +1748,36 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -1804,13 +1804,13 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umax_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -1818,10 +1818,10 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1854,19 +1854,19 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -1875,10 +1875,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -1914,10 +1914,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -1932,12 +1932,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_umax_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32
@@ -1975,38 +1975,38 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2032,22 +2032,22 @@ define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umax_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2078,12 +2078,12 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_umax_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2096,10 +2096,10 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2132,10 +2132,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_umax_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -2148,12 +2148,12 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_umax_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1]
@@ -2188,36 +2188,36 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2244,13 +2244,13 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -2258,10 +2258,10 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2294,19 +2294,19 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2354,10 +2354,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_min_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -2372,12 +2372,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_min_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32
@@ -2415,38 +2415,38 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2472,22 +2472,22 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2518,12 +2518,12 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_min_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2536,10 +2536,10 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2572,10 +2572,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_min_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -2588,12 +2588,12 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_min_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1]
@@ -2628,36 +2628,36 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2684,13 +2684,13 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umin_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
@@ -2698,10 +2698,10 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2734,19 +2734,19 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umin_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
@@ -2755,10 +2755,10 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2794,10 +2794,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_umin_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -2812,12 +2812,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_umin_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32
@@ -2855,38 +2855,38 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umin_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -2912,22 +2912,22 @@ define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_umin_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -2958,12 +2958,12 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_umin_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v2, s6
@@ -2976,10 +2976,10 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
@@ -3012,10 +3012,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_umin_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -3028,12 +3028,12 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_umin_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1]
@@ -3068,36 +3068,36 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umin_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: s_waitcnt vmcnt(0)
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3125,13 +3125,13 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_or_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3140,10 +3140,10 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3176,20 +3176,20 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
;
; GCN2-LABEL: atomic_or_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3197,10 +3197,10 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3237,10 +3237,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GCN2-LABEL: atomic_or_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -3256,12 +3256,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %i
; GFX12-LABEL: atomic_or_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32
@@ -3299,38 +3299,38 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_or_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3357,12 +3357,12 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_or_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3370,10 +3370,10 @@ define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3404,12 +3404,12 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_or_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3422,10 +3422,10 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3459,10 +3459,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GCN2-LABEL: atomic_or_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -3476,12 +3476,12 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) {
; GFX12-LABEL: atomic_or_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1]
@@ -3516,36 +3516,36 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in
;
; GCN2-LABEL: atomic_or_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3573,13 +3573,13 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3588,10 +3588,10 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3620,13 +3620,13 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
;
; GCN2-LABEL: atomic_xchg_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3635,10 +3635,10 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) {
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3667,13 +3667,13 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
;
; GCN2-LABEL: atomic_xchg_pointer_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3682,10 +3682,10 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) {
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3718,20 +3718,20 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_xchg_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -3739,10 +3739,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3779,10 +3779,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GCN2-LABEL: atomic_xchg_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -3798,12 +3798,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64
; GFX12-LABEL: atomic_xchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32
@@ -3841,38 +3841,38 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -3899,12 +3899,12 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xchg_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3912,10 +3912,10 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -3946,12 +3946,12 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_xchg_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -3964,10 +3964,10 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4001,10 +4001,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_xchg_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4018,12 +4018,12 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_xchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1]
@@ -4058,36 +4058,36 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_xchg_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4115,13 +4115,13 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xor_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -4130,10 +4130,10 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4166,20 +4166,20 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_xor_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -4187,10 +4187,10 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4227,10 +4227,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_xor_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -4246,12 +4246,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_xor_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32
@@ -4289,38 +4289,38 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4347,12 +4347,12 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_xor_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4360,10 +4360,10 @@ define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4394,12 +4394,12 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_xor_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4412,10 +4412,10 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4449,10 +4449,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_xor_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4466,12 +4466,12 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_xor_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1]
@@ -4506,36 +4506,36 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_xor_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4564,26 +4564,26 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -4613,24 +4613,24 @@ define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -4665,12 +4665,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -4678,20 +4678,20 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
@@ -4728,31 +4728,31 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
@@ -4783,23 +4783,23 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: s_add_u32 s0, s2, 32
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: s_addc_u32 s1, s3, 0
+; GCN2-NEXT: s_add_u32 s0, s6, 32
+; GCN2-NEXT: s_addc_u32 s1, s7, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -4822,21 +4822,21 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4865,10 +4865,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
; GCN2-LABEL: atomic_store_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -4882,14 +4882,14 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64
; GFX12-LABEL: atomic_store_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -4918,10 +4918,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
; GCN2-LABEL: atomic_store_i64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -4933,14 +4933,14 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index
; GFX12-LABEL: atomic_store_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -4971,16 +4971,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GCN2-LABEL: atomic_cmpxchg_i64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 32
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -4990,11 +4990,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5027,16 +5027,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GCN2-LABEL: atomic_cmpxchg_i64_soffset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s2, s4, 0x11940
-; GCN2-NEXT: s_addc_u32 s3, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_add_u32 s0, s4, 0x11940
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5046,11 +5046,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %ol
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5084,35 +5084,35 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i6
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5146,18 +5146,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5166,12 +5166,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5212,19 +5212,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: s_add_u32 s0, s4, s2
-; GCN2-NEXT: s_addc_u32 s3, s5, s3
-; GCN2-NEXT: s_add_u32 s2, s0, 32
-; GCN2-NEXT: s_addc_u32 s3, s3, 0
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5237,13 +5237,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %o
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5279,14 +5279,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GCN2-LABEL: atomic_cmpxchg_i64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v4, s4
; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: v_mov_b32_e32 v0, s6
; GCN2-NEXT: v_mov_b32_e32 v1, s7
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5296,11 +5296,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5331,33 +5331,33 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in,
;
; GCN2-LABEL: atomic_cmpxchg_i64_ret:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5388,16 +5388,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
;
; GCN2-LABEL: atomic_cmpxchg_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
-; GCN2-NEXT: v_mov_b32_e32 v2, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s10
+; GCN2-NEXT: v_mov_b32_e32 v3, s11
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5406,12 +5406,12 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %ind
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
@@ -5449,17 +5449,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GCN2-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GCN2-NEXT: s_add_u32 s2, s4, s2
-; GCN2-NEXT: s_addc_u32 s3, s5, s3
-; GCN2-NEXT: v_mov_b32_e32 v5, s3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s8
; GCN2-NEXT: v_mov_b32_e32 v1, s9
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v4, s2
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -5472,13 +5472,13 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3]
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5512,26 +5512,26 @@ define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -5561,24 +5561,24 @@ define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) {
;
; GCN2-LABEL: atomic_load_f64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5613,12 +5613,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
;
; GCN2-LABEL: atomic_load_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
@@ -5626,20 +5626,20 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT
@@ -5676,31 +5676,31 @@ define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index)
;
; GCN2-LABEL: atomic_load_f64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] th:TH_LOAD_NT
@@ -5731,23 +5731,23 @@ define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_f64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: s_add_u32 s0, s2, 32
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: s_addc_u32 s1, s3, 0
+; GCN2-NEXT: s_add_u32 s0, s6, 32
+; GCN2-NEXT: s_addc_u32 s1, s7, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -5770,21 +5770,21 @@ define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) {
;
; GCN2-LABEL: atomic_store_f64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_f64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5813,10 +5813,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
; GCN2-LABEL: atomic_store_f64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -5830,14 +5830,14 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out,
; GFX12-LABEL: atomic_store_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32
; GFX12-NEXT: s_endpgm
entry:
@@ -5866,10 +5866,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
; GCN2-LABEL: atomic_store_f64_addr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s6, s0
; GCN2-NEXT: s_addc_u32 s1, s7, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -5881,14 +5881,14 @@ define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %in
; GFX12-LABEL: atomic_store_f64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -5915,13 +5915,13 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_inc_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5930,10 +5930,10 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5966,20 +5966,20 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_inc_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -5987,10 +5987,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6027,10 +6027,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_inc_i64_incr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -6046,12 +6046,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_inc_i64_incr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32
@@ -6089,38 +6089,38 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6147,12 +6147,12 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_inc_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6160,10 +6160,10 @@ define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_inc_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6194,12 +6194,12 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_inc_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6212,10 +6212,10 @@ define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6249,10 +6249,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_inc_i64_incr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -6266,12 +6266,12 @@ define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_inc_i64_incr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1]
@@ -6306,36 +6306,36 @@ define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_inc_i64_ret_incr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_ret_incr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6363,13 +6363,13 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_dec_i64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6378,10 +6378,10 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6414,20 +6414,20 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_dec_i64_ret_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: s_add_u32 s0, s0, 32
-; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: s_add_u32 s0, s4, 32
+; GCN2-NEXT: s_addc_u32 s1, s5, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
@@ -6435,10 +6435,10 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6475,10 +6475,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GCN2-LABEL: atomic_dec_i64_decr64_offset:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
@@ -6494,12 +6494,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %
; GFX12-LABEL: atomic_dec_i64_decr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32
@@ -6537,38 +6537,38 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
@@ -6595,12 +6595,12 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_dec_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6608,10 +6608,10 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) {
;
; GFX12-LABEL: atomic_dec_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3]
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6642,12 +6642,12 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GCN2-LABEL: atomic_dec_i64_ret:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: v_mov_b32_e32 v2, s0
-; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
@@ -6660,10 +6660,10 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) {
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6697,10 +6697,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GCN2-LABEL: atomic_dec_i64_decr64:
; GCN2: ; %bb.0: ; %entry
; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN2-NEXT: s_add_u32 s0, s4, s0
; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
@@ -6714,12 +6714,12 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index)
; GFX12-LABEL: atomic_dec_i64_decr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1]
@@ -6754,36 +6754,36 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_dec_i64_ret_decr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s8
+; GCN2-NEXT: v_mov_b32_e32 v1, s9
; GCN2-NEXT: v_mov_b32_e32 v2, s0
; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_mov_b32_e32 v2, s2
-; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_ret_decr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index d812b4b7d86e6..19601b111efcb 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -4292,24 +4292,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %
;
; GCN2-LABEL: atomic_max_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB88_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4402,25 +4402,25 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4432,30 +4432,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB89_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB89_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -4467,8 +4467,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB89_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -4514,22 +4514,22 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index)
;
; GCN2-LABEL: atomic_max_i64_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB90_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4619,23 +4619,23 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_max_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4647,30 +4647,30 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_execnz .LBB91_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_max_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB91_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -4682,8 +4682,8 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB91_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -5674,24 +5674,24 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64
;
; GCN2-LABEL: atomic_umax_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB102_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -5784,25 +5784,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5814,30 +5814,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN2-NEXT: s_cbranch_execnz .LBB103_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB103_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -5849,8 +5849,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2
; GCN3-NEXT: s_cbranch_execnz .LBB103_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -5899,23 +5899,23 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
;
; GCN2-LABEL: atomic_umax_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5927,30 +5927,30 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN2-NEXT: s_cbranch_execnz .LBB104_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_umax_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB104_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5962,8 +5962,8 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %
; GCN3-NEXT: s_cbranch_execnz .LBB104_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -7898,24 +7898,24 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %
;
; GCN2-LABEL: atomic_min_i64_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s4
-; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v5, s1
; GCN2-NEXT: v_mov_b32_e32 v4, s0
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
; GCN2-NEXT: .LBB125_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8008,25 +8008,25 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
;
; GCN2-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: s_add_u32 s0, s0, 32
; GCN2-NEXT: s_addc_u32 s1, s1, 0
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8038,30 +8038,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN2-NEXT: s_cbranch_execnz .LBB126_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64_offset:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB126_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc
@@ -8073,8 +8073,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2,
; GCN3-NEXT: s_cbranch_execnz .LBB126_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
@@ -8118,20 +8118,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
;
; GCN2-LABEL: atomic_min_i64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN2-NEXT: s_mov_b64 s[0:1], 0
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s0
-; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v5, s1
-; GCN2-NEXT: v_mov_b32_e32 v6, s3
-; GCN2-NEXT: v_mov_b32_e32 v7, s2
-; GCN2-NEXT: v_mov_b32_e32 v4, s0
+; GCN2-NEXT: v_mov_b32_e32 v4, s4
+; GCN2-NEXT: v_mov_b32_e32 v6, s7
+; GCN2-NEXT: v_mov_b32_e32 v7, s6
+; GCN2-NEXT: v_mov_b32_e32 v5, s5
; GCN2-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8139,29 +8139,29 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN2-NEXT: buffer_wbinvl1_vol
; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN2-NEXT: v_mov_b32_e32 v3, v1
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN2-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN2-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN2-NEXT: s_cbranch_execnz .LBB127_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
+; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN3-NEXT: s_mov_b64 s[0:1], 0
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s0
-; GCN3-NEXT: v_mov_b32_e32 v1, s1
+; GCN3-NEXT: v_mov_b32_e32 v0, s4
+; GCN3-NEXT: v_mov_b32_e32 v1, s5
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v5, s1
-; GCN3-NEXT: v_mov_b32_e32 v6, s3
-; GCN3-NEXT: v_mov_b32_e32 v7, s2
-; GCN3-NEXT: v_mov_b32_e32 v4, s0
+; GCN3-NEXT: v_mov_b32_e32 v4, s4
+; GCN3-NEXT: v_mov_b32_e32 v6, s7
+; GCN3-NEXT: v_mov_b32_e32 v7, s6
+; GCN3-NEXT: v_mov_b32_e32 v5, s5
; GCN3-NEXT: .LBB127_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GCN3-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8169,9 +8169,9 @@ define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) {
; GCN3-NEXT: buffer_wbinvl1_vol
; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GCN3-NEXT: v_mov_b32_e32 v3, v1
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GCN3-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN3-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GCN3-NEXT: s_cbranch_execnz .LBB127_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_endpgm
@@ -8218,23 +8218,23 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
;
; GCN2-LABEL: atomic_min_i64_ret_addr64:
; GCN2: ; %bb.0: ; %entry
-; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN2-NEXT: s_waitcnt lgkmcnt(0)
-; GCN2-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN2-NEXT: s_add_u32 s0, s0, s6
-; GCN2-NEXT: s_addc_u32 s1, s1, s7
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
; GCN2-NEXT: v_mov_b32_e32 v0, s0
; GCN2-NEXT: v_mov_b32_e32 v1, s1
; GCN2-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN2-NEXT: s_mov_b64 s[0:1], 0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: v_mov_b32_e32 v5, s4
+; GCN2-NEXT: v_mov_b32_e32 v4, s9
+; GCN2-NEXT: v_mov_b32_e32 v5, s8
; GCN2-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v9, v3
; GCN2-NEXT: v_mov_b32_e32 v8, v2
-; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN2-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN2-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN2-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN2-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8246,30 +8246,30 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN2-NEXT: s_cbranch_execnz .LBB128_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN2-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN2-NEXT: v_mov_b32_e32 v0, s2
-; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
; GCN2-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN2-NEXT: s_endpgm
;
; GCN3-LABEL: atomic_min_i64_ret_addr64:
; GCN3: ; %bb.0: ; %entry
-; GCN3-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GCN3-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GCN3-NEXT: s_waitcnt lgkmcnt(0)
-; GCN3-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GCN3-NEXT: s_add_u32 s0, s0, s6
-; GCN3-NEXT: s_addc_u32 s1, s1, s7
+; GCN3-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GCN3-NEXT: s_add_u32 s0, s4, s0
+; GCN3-NEXT: s_addc_u32 s1, s5, s1
; GCN3-NEXT: v_mov_b32_e32 v0, s0
; GCN3-NEXT: v_mov_b32_e32 v1, s1
; GCN3-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
; GCN3-NEXT: s_mov_b64 s[0:1], 0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: v_mov_b32_e32 v5, s4
+; GCN3-NEXT: v_mov_b32_e32 v4, s9
+; GCN3-NEXT: v_mov_b32_e32 v5, s8
; GCN3-NEXT: .LBB128_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v9, v3
; GCN3-NEXT: v_mov_b32_e32 v8, v2
-; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; GCN3-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; GCN3-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; GCN3-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; GCN3-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -8281,8 +8281,8 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %i
; GCN3-NEXT: s_cbranch_execnz .LBB128_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN3-NEXT: s_or_b64 exec, exec, s[0:1]
-; GCN3-NEXT: v_mov_b32_e32 v0, s2
-; GCN3-NEXT: v_mov_b32_e32 v1, s3
+; GCN3-NEXT: v_mov_b32_e32 v0, s6
+; GCN3-NEXT: v_mov_b32_e32 v1, s7
; GCN3-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN3-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 2a9a9ef7c43b6..7bbbb6d14e809 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -948,12 +948,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -967,12 +967,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1041,12 +1041,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1060,12 +1060,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1133,12 +1133,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1153,12 +1153,12 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1226,12 +1226,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1246,12 +1246,12 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1319,12 +1319,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1339,12 +1339,12 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1412,12 +1412,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1432,12 +1432,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1505,12 +1505,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1525,12 +1525,12 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1598,12 +1598,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1618,12 +1618,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1691,12 +1691,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1711,12 +1711,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1784,12 +1784,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, -1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1804,12 +1804,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, -v2
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1877,12 +1877,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1897,12 +1897,12 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -1970,12 +1970,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; GFX11-NOFMA: ; %bb.0:
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NOFMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NOFMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x1
; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-NOFMA-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
@@ -1990,12 +1990,12 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v2, v1, v2
; GFX11-FMA-NEXT: global_store_b32 v0, v2, s[4:5]
@@ -2081,13 +2081,13 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
;
; GFX11-NOFMA-LABEL: test_f32_interp:
; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-NOFMA-NEXT: global_load_b32 v1, v0, s[10:11]
+; GFX11-NOFMA-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-NOFMA-NEXT: global_load_b32 v3, v0, s[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_sub_f32_e32 v4, 1.0, v1
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
@@ -2095,26 +2095,26 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
; GFX11-NOFMA-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fmac_f32_e32 v2, v3, v1
-; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NOFMA-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f32_interp:
; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[10:11]
+; GFX11-FMA-NEXT: global_load_b32 v3, v0, s[6:7]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f32 v1, -v2, v1, v1
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fmac_f32_e32 v1, v3, v2
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -2165,13 +2165,13 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
;
; GFX11-NOFMA-LABEL: test_f64_interp:
; GFX11-NOFMA: ; %bb.0:
-; GFX11-NOFMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NOFMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NOFMA-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NOFMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOFMA-NEXT: s_clause 0x2
-; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[6:7]
-; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[4:5]
-; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[2:3]
+; GFX11-NOFMA-NEXT: global_load_b64 v[0:1], v8, s[10:11]
+; GFX11-NOFMA-NEXT: global_load_b64 v[2:3], v8, s[8:9]
+; GFX11-NOFMA-NEXT: global_load_b64 v[4:5], v8, s[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(2)
; GFX11-NOFMA-NEXT: v_add_f64 v[6:7], -v[0:1], 1.0
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(1)
@@ -2179,26 +2179,26 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
; GFX11-NOFMA-NEXT: v_mul_f64 v[2:3], v[2:3], v[6:7]
; GFX11-NOFMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-NOFMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3]
-; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT: global_store_b64 v8, v[0:1], s[4:5]
; GFX11-NOFMA-NEXT: s_nop 0
; GFX11-NOFMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NOFMA-NEXT: s_endpgm
;
; GFX11-FMA-LABEL: test_f64_interp:
; GFX11-FMA: ; %bb.0:
-; GFX11-FMA-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FMA-NEXT: v_mov_b32_e32 v6, 0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x2
-; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[4:5]
-; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[6:7]
-; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[2:3]
+; GFX11-FMA-NEXT: global_load_b64 v[0:1], v6, s[8:9]
+; GFX11-FMA-NEXT: global_load_b64 v[2:3], v6, s[10:11]
+; GFX11-FMA-NEXT: global_load_b64 v[4:5], v6, s[6:7]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], v[0:1]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FMA-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1]
-; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[0:1]
+; GFX11-FMA-NEXT: global_store_b64 v6, v[0:1], s[4:5]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -2236,15 +2236,15 @@ define amdgpu_kernel void @fma_neg_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: fma_neg_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, 2.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2282,15 +2282,15 @@ define amdgpu_kernel void @fma_2.0_neg_a_b_f32(ptr addrspace(1) %out, ptr addrsp
;
; GFX11-LABEL: fma_2.0_neg_a_b_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:4 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fmac_f32_e32 v2, -2.0, v1
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2333,19 +2333,19 @@ define amdgpu_kernel void @fma_neg_b_c_v4f32(ptr addrspace(1) %out, ptr addrspac
;
; GFX11-LABEL: fma_neg_b_c_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: global_load_b128 v[0:3], v12, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[4:7], v12, s[2:3]
-; GFX11-NEXT: global_load_b128 v[8:11], v12, s[2:3] offset:48
+; GFX11-NEXT: global_load_b128 v[0:3], v12, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[4:7], v12, s[6:7]
+; GFX11-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:48
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_fma_f32 v3, v11, -v7, -v3
; GFX11-NEXT: v_fma_f32 v2, v10, -v6, -v2
; GFX11-NEXT: v_fma_f32 v1, v9, -v5, -v1
; GFX11-NEXT: v_fma_f32 v0, v8, -v4, -v0
-; GFX11-NEXT: global_store_b128 v12, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v12, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 23eb73038917d..36d917f64c9de 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -37,92 +37,92 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max3_f32 v0, v0, v1, v2
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f32 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -169,92 +169,92 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max3_f32 v0, v2, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f32 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f32 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -304,96 +304,96 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_max_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f16 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -444,96 +444,96 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmax3_olt_1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmax3_olt_1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max3_f16 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmax3_olt_1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max3_f16 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index 01b2f207388e8..35621f892aa6d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -28,15 +28,15 @@ define amdgpu_kernel void @test_fmax_legacy_uge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_uge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -80,15 +80,15 @@ define amdgpu_kernel void @test_fmax_legacy_oge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_oge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -132,15 +132,15 @@ define amdgpu_kernel void @test_fmax_legacy_ugt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_ugt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -184,15 +184,15 @@ define amdgpu_kernel void @test_fmax_legacy_ogt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmax_legacy_ogt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 87ac95a1cd739..a8815c204cd39 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -263,12 +263,12 @@ define amdgpu_kernel void @fmaximumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_maximum_f32 v1, v1, v2
; GCN-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -287,12 +287,12 @@ define amdgpu_kernel void @fmaximum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_u16 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_maximum_f16 v1, v1, v2
; GCN-NEXT: global_store_b16 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 764fb992d4d34..4543038423be0 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -45,15 +45,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -63,16 +63,16 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -83,27 +83,27 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -156,15 +156,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -174,16 +174,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -194,27 +194,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_f32(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -268,15 +268,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -286,16 +286,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -306,27 +306,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute0_f32(ptr addrspace(1)
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -380,15 +380,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -398,16 +398,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -418,27 +418,27 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_commute1_f32(ptr addrspace(1)
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_commute1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -494,15 +494,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -513,17 +513,17 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -534,28 +534,28 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_constant_order_f32(ptr addrsp
;
; GFX9-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_max_f32_e32 v1, 4.0, v1
; GFX9-NEXT: v_min_f32_e32 v1, 2.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_nnan_r_i_i_constant_order_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_maxmin_f32 v1, v1, 4.0, 2.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -617,15 +617,15 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; VI-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -639,16 +639,16 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; VI-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -663,50 +663,50 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; GFX9-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1
; GFX9-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_max_f32_e32 v1, 2.0, v1
; GFX11-SDAG-NEXT: v_min_f32_e32 v2, 4.0, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -714,18 +714,18 @@ define amdgpu_kernel void @v_test_fmed3_nnan_r_i_i_multi_use_f32(ptr addrspace(1
;
; GFX11-GISEL-LABEL: v_test_fmed3_nnan_r_i_i_multi_use_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v2, v1, 2.0, 4.0
; GFX11-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -783,15 +783,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_fmed3_r_i_i_f64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -802,16 +802,16 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_fmed3_r_i_i_f64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -823,29 +823,29 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_fmed3_r_i_i_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_r_i_i_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], 2.0
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -897,15 +897,15 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_med3_f32 v2, v3, 2.0, 4.0
@@ -914,16 +914,16 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -933,24 +933,24 @@ define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_fmed3_r_i_i_no_nans_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1004,15 +1004,15 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -1022,16 +1022,16 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1045,52 +1045,52 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
;
; GFX9-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-GISEL-NEXT: v_cmp_nlt_f32_e32 vcc, 2.0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc
; GFX9-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc, 4.0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, 2.0, 4.0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_legacy_fmed3_r_i_i_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -1098,7 +1098,7 @@ define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(ptr addrspace(1) %out,
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 2.0, vcc_lo
; GFX11-GISEL-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 4.0, v1
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v1, 4.0, vcc_lo
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1170,17 +1170,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1188,8 +1188,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1197,19 +1197,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1218,8 +1218,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -1229,67 +1229,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1360,17 +1360,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1378,8 +1378,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, -v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1387,19 +1387,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1408,8 +1408,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v2, -1.0, v2
@@ -1419,67 +1419,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, -v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -v2, -v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1550,17 +1550,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1568,8 +1568,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, -v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1577,19 +1577,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1598,8 +1598,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v3, -1.0, v3
@@ -1609,67 +1609,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, v1, v2, -v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod2:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -v3, -v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1741,17 +1741,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1759,8 +1759,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, |v2|, -|v3|
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1768,19 +1768,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1789,8 +1789,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -1801,69 +1801,69 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(ptr addrs
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, |v2|, -|v3|
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod012:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, |v2|, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1942,17 +1942,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -1960,8 +1960,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -|v7|, -|v2|, -|v3|
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -1969,19 +1969,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -1990,8 +1990,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e64 v4, -1.0, |v7|
@@ -2003,71 +2003,71 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(ptr addrs
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX9-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX9-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -|v1|, -|v2|, -|v3|
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_negabs012:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -|v1|, -|v1|
; GFX11-GISEL-NEXT: v_max_f32_e64 v2, -|v2|, -|v2|
; GFX11-GISEL-NEXT: v_max_f32_e64 v3, -|v3|, -|v3|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -2151,17 +2151,17 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2169,8 +2169,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -2181,19 +2181,19 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_nnan_inputs_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2202,8 +2202,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -2215,38 +2215,38 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2320,17 +2320,17 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; VI-SDAG-LABEL: v_nnan_input_calls_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2338,8 +2338,8 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2347,19 +2347,19 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; VI-GISEL-LABEL: v_nnan_input_calls_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2368,8 +2368,8 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2378,32 +2378,32 @@ define amdgpu_kernel void @v_nnan_input_calls_med3_f32_pat0(ptr addrspace(1) %ou
;
; GFX9-LABEL: v_nnan_input_calls_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_input_calls_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2472,17 +2472,17 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_nnan_call_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2490,8 +2490,8 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2499,19 +2499,19 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_nnan_call_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2520,8 +2520,8 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2530,32 +2530,32 @@ define amdgpu_kernel void @v_nnan_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_nnan_call_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_call_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2624,17 +2624,17 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_fast_call_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2642,8 +2642,8 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2651,19 +2651,19 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_fast_call_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2672,8 +2672,8 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2682,32 +2682,32 @@ define amdgpu_kernel void @v_fast_call_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_fast_call_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_fast_call_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2788,17 +2788,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2806,8 +2806,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2815,19 +2815,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2836,8 +2836,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2846,32 +2846,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2940,17 +2940,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -2958,8 +2958,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -2967,19 +2967,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -2988,8 +2988,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -2998,32 +2998,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3093,17 +3093,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3111,8 +3111,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3120,19 +3120,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3141,8 +3141,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -3152,67 +3152,67 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat1_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3282,17 +3282,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3300,8 +3300,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3309,19 +3309,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3330,8 +3330,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -3340,32 +3340,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3434,17 +3434,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat3:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3452,8 +3452,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3461,19 +3461,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat3:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3482,8 +3482,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -3492,32 +3492,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat3:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat3:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3586,17 +3586,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat4:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3604,8 +3604,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3613,19 +3613,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat4:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3634,8 +3634,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3644,32 +3644,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat4:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3738,17 +3738,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat5:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3756,8 +3756,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3765,19 +3765,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat5:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3786,8 +3786,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3796,32 +3796,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat5:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat5:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3890,17 +3890,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat6:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3908,8 +3908,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -3917,19 +3917,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat6:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -3938,8 +3938,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -3948,32 +3948,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat6:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat6:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4042,17 +4042,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat7:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4060,8 +4060,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4069,19 +4069,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat7:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4090,8 +4090,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4100,32 +4100,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat7:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat7:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4194,17 +4194,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat8:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4212,8 +4212,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4221,19 +4221,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat8:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4242,8 +4242,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -4252,32 +4252,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat8:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4346,17 +4346,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat9:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4364,8 +4364,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4373,19 +4373,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat9:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4394,8 +4394,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4404,32 +4404,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat9:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat9:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4498,17 +4498,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat10:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4516,8 +4516,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4525,19 +4525,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat10:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4546,8 +4546,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -4556,32 +4556,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat10:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat10:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4650,17 +4650,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat11:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4668,8 +4668,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4677,19 +4677,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat11:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4698,8 +4698,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4708,32 +4708,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat11:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat11:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4802,17 +4802,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat12:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4820,8 +4820,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4829,19 +4829,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat12:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -4850,8 +4850,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -4860,32 +4860,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat12:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat12:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4954,17 +4954,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat13:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -4972,8 +4972,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -4981,19 +4981,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat13:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5002,8 +5002,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -5012,32 +5012,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat13:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat13:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5106,17 +5106,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat14:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5124,8 +5124,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5133,19 +5133,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat14:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5154,8 +5154,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -5164,32 +5164,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat14:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat14:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5258,17 +5258,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat15:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5276,8 +5276,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v2, v7, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5285,19 +5285,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat15:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5306,8 +5306,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v2, v7, v3
@@ -5316,32 +5316,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat15:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat15:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v2, v1, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5413,17 +5413,17 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5431,8 +5431,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -5440,19 +5440,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5461,8 +5461,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_med3_f32 v2, v7, v2, v3
@@ -5471,32 +5471,32 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat16(ptr addrspace(1) %
;
; GFX9-LABEL: v_test_global_nnans_med3_f32_pat16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_med3_f32_pat16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5588,17 +5588,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5606,8 +5606,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -5623,19 +5623,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5644,8 +5644,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -5662,14 +5662,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -5680,19 +5680,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -5701,7 +5701,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(ptr addrspace(1)
; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v4
; GFX11-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -5790,17 +5790,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -5808,8 +5808,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -5825,19 +5825,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -5846,8 +5846,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -5864,14 +5864,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -5882,19 +5882,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5904,21 +5904,21 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -5927,7 +5927,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(ptr addrspace(1)
; GFX11-GISEL-NEXT: v_minmax_f32 v2, v1, v2, v4
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -6016,17 +6016,17 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6034,8 +6034,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -6051,19 +6051,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6072,8 +6072,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -6090,14 +6090,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6108,19 +6108,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_safe_med3_f32_pat0_multi_use2:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-NEXT: v_max_f32_e32 v3, v3, v3
@@ -6129,7 +6129,7 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(ptr addrspace(1)
; GFX11-NEXT: v_minmax_f32 v1, v1, v2, v3
; GFX11-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6210,26 +6210,26 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s10, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_mul_f32_e32 v4, 1.0, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -6243,19 +6243,19 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_safe_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
@@ -6264,8 +6264,8 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, 1.0, v7
@@ -6280,14 +6280,14 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_safe_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v1
; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
@@ -6296,47 +6296,47 @@ define amdgpu_kernel void @v_test_safe_med3_f32_pat0(ptr addrspace(1) %out, ptr
; GFX9-NEXT: v_max_f32_e32 v2, v3, v3
; GFX9-NEXT: v_min_f32_e32 v1, v1, v2
; GFX9-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_safe_med3_f32_pat0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-SDAG-NEXT: v_max_f32_e32 v3, v3, v3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3
; GFX11-SDAG-NEXT: v_minmax_f32 v1, v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_safe_med3_f32_pat0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v2, v2, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v1, v2
; GFX11-GISEL-NEXT: v_dual_max_f32 v1, v1, v2 :: v_dual_max_f32 v2, v3, v3
; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v2, v4
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -6411,17 +6411,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6429,8 +6429,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6441,19 +6441,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6462,8 +6462,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6475,38 +6475,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing0_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6586,17 +6586,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6604,8 +6604,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6616,19 +6616,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6637,8 +6637,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6650,38 +6650,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing1_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6761,17 +6761,17 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; VI-SDAG-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6779,8 +6779,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f32_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, 2.0, v2
@@ -6791,19 +6791,19 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; VI-GISEL-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6812,8 +6812,8 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f32_e32 v4, 1.0, v7
@@ -6825,38 +6825,38 @@ define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(ptr addrspace(1)
;
; GFX9-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f32_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_missing2_med3_f32_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v1, 1.0, v1 :: v_dual_add_f32 v2, 2.0, v2
; GFX11-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -6931,17 +6931,17 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -6949,8 +6949,8 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_med3_f32 v2, -v7, v2, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -6958,19 +6958,19 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; VI-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -6979,8 +6979,8 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -6990,67 +6990,67 @@ define amdgpu_kernel void @v_test_nnan_on_call_med3_f32_pat0_srcmod0(ptr addrspa
;
; GFX9-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX9-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_med3_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_nnan_on_call_med3_f32_pat0_srcmod0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v1, -v1, -v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v1, v1, v2, v3
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7126,26 +7126,26 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; VI-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
; VI-SDAG-NEXT: flat_load_dword v6, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v2, v[2:3] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v4
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s10, v4
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v4
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_min_f32_e64 v4, -v6, v2
; VI-SDAG-NEXT: v_max_f32_e32 v2, v6, v2
@@ -7156,19 +7156,19 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; VI-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v2, v[2:3] glc
@@ -7177,8 +7177,8 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v4, -1.0, v7
@@ -7191,77 +7191,77 @@ define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(pt
;
; GFX9-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-SDAG-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_min_f32_e64 v4, -v1, v2
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-SDAG-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-SDAG-NEXT: v_max_f32_e32 v1, v4, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-GISEL-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-GISEL-NEXT: v_min_f32_e32 v2, v4, v2
; GFX9-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_max_f32_e32 v1, v2, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-SDAG-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_maxmin_f32 v3, v1, v2, v3
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_minmax_f32 v1, -v1, v2, v3
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-GISEL-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_max_f32_e64 v4, -v1, -v1
; GFX11-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_min_f32_e32 v4, v4, v2
; GFX11-GISEL-NEXT: v_minmax_f32 v1, v1, v3, v4
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7334,17 +7334,17 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_global_nnans_min_max_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_dword v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -7352,8 +7352,8 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_max_f32_e32 v2, v7, v2
; VI-SDAG-NEXT: v_min_f32_e32 v2, v2, v3
@@ -7362,19 +7362,19 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_global_nnans_min_max_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_dword v7, v[0:1] glc
@@ -7383,8 +7383,8 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_max_f32_e32 v2, v7, v2
@@ -7394,33 +7394,33 @@ define amdgpu_kernel void @v_test_global_nnans_min_max_f32(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_global_nnans_min_max_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_dword v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f32_e32 v1, v1, v2
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_global_nnans_min_max_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_b32 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_maxmin_f32 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7487,15 +7487,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -7506,17 +7506,17 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f16_e32 v2, 1.0, v3
@@ -7527,27 +7527,27 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
;
; GFX9-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_nnan_input_fmed3_r_i_i_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f16 v1, v1, 2.0, 4.0
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7644,17 +7644,17 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_nnan_inputs_med3_f16_pat0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v6, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s5
-; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s4, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s9
+; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, s8, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s7
-; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s6, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s11
+; VI-SDAG-NEXT: v_add_u32_e32 v4, vcc, s10, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-SDAG-NEXT: flat_load_ushort v7, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -7662,8 +7662,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_ushort v3, v[4:5] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v6
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_add_f16_e32 v4, 1.0, v7
; VI-SDAG-NEXT: v_add_f16_e32 v2, 2.0, v2
@@ -7677,19 +7677,19 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_nnan_inputs_med3_f16_pat0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v6, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s8
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s9
; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s6
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s7
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s10
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s11
; VI-GISEL-NEXT: v_add_u32_e32 v4, vcc, v4, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-GISEL-NEXT: flat_load_ushort v7, v[0:1] glc
@@ -7698,8 +7698,8 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_ushort v3, v[4:5] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_add_f16_e32 v4, 1.0, v7
@@ -7714,39 +7714,39 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
;
; GFX9-LABEL: v_nnan_inputs_med3_f16_pat0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX9-NEXT: v_add_f16_e32 v2, 2.0, v2
; GFX9-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX9-NEXT: v_med3_f16 v1, v1, v2, v3
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: v_nnan_inputs_med3_f16_pat0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v3, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_u16 v3, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1
; GFX11-NEXT: v_add_f16_e32 v2, 2.0, v2
; GFX11-NEXT: v_add_f16_e32 v3, 4.0, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_med3_f16 v1, v1, v2, v3
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -7810,15 +7810,15 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: two_non_inline_constant:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7829,17 +7829,17 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: two_non_inline_constant:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7850,45 +7850,45 @@ define amdgpu_kernel void @two_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: two_non_inline_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX9-NEXT: v_max_f32_e32 v1, 0x41000000, v1
; GFX9-NEXT: v_min_f32_e32 v1, 0x41800000, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: two_non_inline_constant:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s2, 0x41800000
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_maxmin_f32 v1, v1, s0, 0x41800000
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: two_non_inline_constant:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0.5, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_maxmin_f32 v1, v1, 0x41000000, v2
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -7952,16 +7952,16 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: one_non_inline_constant:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
@@ -7974,17 +7974,17 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: one_non_inline_constant:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -7998,32 +7998,32 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: one_non_inline_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1
; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: one_non_inline_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v2, 0.5, v1
; GFX11-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_med3_f32 v2, v2, 1.0, 0x41800000
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
@@ -8099,21 +8099,21 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; VI-SDAG-LABEL: two_non_inline_constant_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x41800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: s_mov_b32 s2, 0x41000000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v3
-; VI-SDAG-NEXT: v_med3_f32 v2, v2, s2, v4
+; VI-SDAG-NEXT: v_med3_f32 v2, v2, s0, v4
; VI-SDAG-NEXT: v_add_f32_e32 v5, 0x41800000, v3
; VI-SDAG-NEXT: v_add_f32_e32 v3, 0x41000000, v3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
@@ -8125,18 +8125,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; VI-GISEL-LABEL: two_non_inline_constant_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x41000000
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x41800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -8153,18 +8153,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v4, 0x41800000, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2
-; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s0, v2
+; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v1, off
@@ -8173,18 +8173,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41800000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
-; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v5, off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v1, off
@@ -8193,18 +8193,18 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0x41000000
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s2, 0x41000000
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_f32_e32 v3, 0x41800000, v1
; GFX11-SDAG-NEXT: v_add_f32_e32 v2, 0.5, v1
; GFX11-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s2, 0x41800000
-; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-SDAG-NEXT: v_med3_f32 v2, v2, s0, 0x41800000
+; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v1, off dlc
@@ -8215,17 +8215,17 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
;
; GFX11-GISEL-LABEL: two_non_inline_constant_multi_use:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0x41800000 :: v_dual_add_f32 v3, 0.5, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_med3_f32 v2, v3, 0x41000000, v2
; GFX11-GISEL-NEXT: v_add_f32_e32 v3, 0x41800000, v1
; GFX11-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v3, off dlc
; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v1, off dlc
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 7337d90b4bea6..2d179556235e9 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -37,92 +37,92 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_min3_f32 v0, v0, v1, v2
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -169,92 +169,92 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_min3_f32 v0, v2, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_dword v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_dword v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f32 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f32 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -304,96 +304,96 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_min_f16_e32 v0, v0, v1
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v0, v1, v2
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f16 v0, v0, v1, v2
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -444,96 +444,96 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_min_f16_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e32 v1, v2, v2
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_min3_f16 v0, v2, v0, v1
-; GFX9-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_min3_f16 v0, v2, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -646,103 +646,103 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_0_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_0_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s12, s10
+; GFX11-NEXT: s_mov_b32 s13, s11
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -793,103 +793,103 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_fmin3_olt_1_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; VI-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; VI-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; VI-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; VI-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_fmin3_olt_1_f64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_mov_b32 s14, s10
-; GFX9-NEXT: s_mov_b32 s15, s11
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s14, s2
+; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s12, s2
-; GFX9-NEXT: s_mov_b32 s13, s3
-; GFX9-NEXT: s_mov_b32 s16, s4
-; GFX9-NEXT: s_mov_b32 s17, s5
-; GFX9-NEXT: s_mov_b32 s18, s10
-; GFX9-NEXT: s_mov_b32 s19, s11
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b32 s16, s8
+; GFX9-NEXT: s_mov_b32 s17, s9
+; GFX9-NEXT: s_mov_b32 s18, s2
+; GFX9-NEXT: s_mov_b32 s19, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s6
-; GFX9-NEXT: s_mov_b32 s5, s7
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[4:7], 0 glc
+; GFX9-NEXT: s_mov_b32 s8, s10
+; GFX9-NEXT: s_mov_b32 s9, s11
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
+; GFX9-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s0
-; GFX9-NEXT: s_mov_b32 s9, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX9-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX9-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: test_fmin3_olt_1_f64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s6
-; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: s_mov_b32 s8, s0
+; GFX11-NEXT: s_mov_b32 s12, s10
+; GFX11-NEXT: s_mov_b32 s13, s11
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[2:3]
; GFX11-NEXT: v_max_f64 v[2:3], v[4:5], v[4:5]
; GFX11-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index d20c39d510364..1620ecfc20260 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -26,15 +26,15 @@ define amdgpu_kernel void @test_fmin_legacy_uge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_uge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -78,15 +78,15 @@ define amdgpu_kernel void @test_fmin_legacy_ugt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ugt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nle_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -130,15 +130,15 @@ define amdgpu_kernel void @test_fmin_legacy_ule_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ule_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ngt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -182,15 +182,15 @@ define amdgpu_kernel void @test_fmin_legacy_ult_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ult_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -234,15 +234,15 @@ define amdgpu_kernel void @test_fmin_legacy_oge_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_oge_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_ge_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -286,15 +286,15 @@ define amdgpu_kernel void @test_fmin_legacy_ogt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ogt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -338,15 +338,15 @@ define amdgpu_kernel void @test_fmin_legacy_ole_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_ole_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_le_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
@@ -390,15 +390,15 @@ define amdgpu_kernel void @test_fmin_legacy_olt_f64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: test_fmin_legacy_olt_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 45f6bff10f45e..0464b9af6023f 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -263,12 +263,12 @@ define amdgpu_kernel void @fminimumi_f32_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_b32 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_minimum_f32 v1, v1, v2
; GCN-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -287,12 +287,12 @@ define amdgpu_kernel void @fminimum_f16_move_to_valu(ptr addrspace(1) %out, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_clause 0x1
; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_load_u16 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_u16 v2, v0, s[2:3] scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_minimum_f16 v1, v1, v2
; GCN-NEXT: global_store_b16 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
index 98faaacf1dfb0..384ea30a77f73 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -58,24 +58,24 @@ define amdgpu_kernel void @fmul_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -113,38 +113,38 @@ define amdgpu_kernel void @fmul_f16_imm_a(
;
; GFX89-LABEL: fmul_f16_imm_a:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,38 +180,38 @@ define amdgpu_kernel void @fmul_f16_imm_b(
;
; GFX89-LABEL: fmul_f16_imm_b:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,23 +309,23 @@ define amdgpu_kernel void @fmul_v2f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,60 +369,60 @@ define amdgpu_kernel void @fmul_v2f16_imm_a(
;
; VI-LABEL: fmul_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4400
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x44004200
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x44004200
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,60 +464,60 @@ define amdgpu_kernel void @fmul_v2f16_imm_b(
;
; VI-LABEL: fmul_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4200
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v0, 4.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x42004400
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x42004400
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -631,24 +631,24 @@ define amdgpu_kernel void @fmul_v4f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v1, v3, v1
; GFX11-NEXT: v_pk_mul_f16 v0, v2, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -701,18 +701,18 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
;
; VI-LABEL: fmul_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_f16_sdwa v2, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mul_f16_e32 v1, 0x4200, v1
@@ -720,47 +720,47 @@ define amdgpu_kernel void @fmul_v4f16_imm_a(
; VI-NEXT: v_mul_f16_e32 v0, 0x4800, v0
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmul_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s2, 0x44004200
-; GFX9-NEXT: s_mov_b32 s3, 0x40004800
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s6, 0x44004200
+; GFX9-NEXT: s_mov_b32 s7, 0x40004800
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_f16 v1, v1, s2
-; GFX9-NEXT: v_pk_mul_f16 v0, v0, s3
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_pk_mul_f16 v1, v1, s6
+; GFX9-NEXT: v_pk_mul_f16 v0, v0, s7
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fmul_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_mul_f16 v1, 0x44004200, v1
; GFX11-NEXT: v_pk_mul_f16 v0, 0x40004800, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
index bde0dc326f9ac..25ec5b1665ce8 100644
--- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -22,19 +22,19 @@ declare half @llvm.fabs.f16(half) #1
define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmuladd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -42,19 +42,19 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-DENORM-LABEL: fmuladd_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
@@ -62,65 +62,65 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-FLUSH-LABEL: fmuladd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -136,19 +136,19 @@ define amdgpu_kernel void @fmuladd_f16(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmul_fadd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -156,19 +156,19 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-CONTRACT-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-CONTRACT-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-CONTRACT-NEXT: flat_store_short v[0:1], v2
@@ -176,100 +176,100 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-FLUSH-LABEL: fmul_fadd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-STRICT-NEXT: s_clause 0x2
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
; GFX10-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: s_clause 0x2
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fmul_fadd_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-STRICT-NEXT: s_clause 0x2
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(1)
; GFX11-DENORM-STRICT-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fmul_fadd_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: s_clause 0x2
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -286,19 +286,19 @@ define amdgpu_kernel void @fmul_fadd_f16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
; VI-FLUSH-LABEL: fmul_fadd_contract_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
-; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s4
-; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s5
-; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s6
-; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s6
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s7
+; VI-FLUSH-NEXT: v_mov_b32_e32 v2, s8
+; VI-FLUSH-NEXT: v_mov_b32_e32 v3, s9
+; VI-FLUSH-NEXT: v_mov_b32_e32 v4, s10
+; VI-FLUSH-NEXT: v_mov_b32_e32 v5, s11
; VI-FLUSH-NEXT: flat_load_ushort v6, v[0:1]
; VI-FLUSH-NEXT: flat_load_ushort v2, v[2:3]
; VI-FLUSH-NEXT: flat_load_ushort v3, v[4:5]
-; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s4
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v3, v6, v2
; VI-FLUSH-NEXT: flat_store_short v[0:1], v3
@@ -306,19 +306,19 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
;
; VI-DENORM-LABEL: fmul_fadd_contract_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
-; VI-DENORM-NEXT: v_mov_b32_e32 v2, s4
-; VI-DENORM-NEXT: v_mov_b32_e32 v3, s5
-; VI-DENORM-NEXT: v_mov_b32_e32 v4, s6
-; VI-DENORM-NEXT: v_mov_b32_e32 v5, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s6
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s7
+; VI-DENORM-NEXT: v_mov_b32_e32 v2, s8
+; VI-DENORM-NEXT: v_mov_b32_e32 v3, s9
+; VI-DENORM-NEXT: v_mov_b32_e32 v4, s10
+; VI-DENORM-NEXT: v_mov_b32_e32 v5, s11
; VI-DENORM-NEXT: flat_load_ushort v6, v[0:1]
; VI-DENORM-NEXT: flat_load_ushort v2, v[2:3]
; VI-DENORM-NEXT: flat_load_ushort v3, v[4:5]
-; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; VI-DENORM-NEXT: v_mov_b32_e32 v0, s4
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v2, v6, v2, v3
; VI-DENORM-NEXT: flat_store_short v[0:1], v2
@@ -326,65 +326,65 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
;
; GFX10-FLUSH-LABEL: fmul_fadd_contract_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-FLUSH-NEXT: s_clause 0x2
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-FLUSH-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmul_fadd_contract_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DENORM-NEXT: s_clause 0x2
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3]
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[4:5]
-; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[6:7]
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[8:9]
+; GFX10-DENORM-NEXT: global_load_ushort v3, v0, s[10:11]
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX10-DENORM-NEXT: global_store_short v0, v3, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v3, s[4:5]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmul_fadd_contract_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_mov_b32_e32 v0, 0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_clause 0x2
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-FLUSH-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v1, v1, v2
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v3
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmul_fadd_contract_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_mov_b32_e32 v0, 0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_clause 0x2
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3]
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[4:5]
-; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[6:7]
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[8:9]
+; GFX11-DENORM-NEXT: global_load_u16 v3, v0, s[10:11]
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v3, v1, v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v3, s[4:5]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -401,11 +401,11 @@ define amdgpu_kernel void @fmul_fadd_contract_f16(ptr addrspace(1) %out, ptr add
define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -419,11 +419,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -437,59 +437,59 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -509,11 +509,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -527,11 +527,11 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -545,59 +545,59 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_a_2.0_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_a_2.0_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -617,11 +617,11 @@ define amdgpu_kernel void @fmuladd_a_2.0_b_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
; VI-FLUSH-LABEL: fadd_a_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -635,11 +635,11 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
;
; VI-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -653,90 +653,90 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
;
; GFX10-FLUSH-LABEL: fadd_a_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fadd_a_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fadd_a_a_b_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fadd_a_a_b_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -759,11 +759,11 @@ define amdgpu_kernel void @fadd_a_a_b_f16(ptr addrspace(1) %out,
define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
; VI-FLUSH-LABEL: fadd_b_a_a_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -777,11 +777,11 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
;
; VI-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -795,90 +795,90 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
;
; GFX10-FLUSH-LABEL: fadd_b_a_a_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fadd_b_a_a_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fadd_b_a_a_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fadd_b_a_a_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -901,11 +901,11 @@ define amdgpu_kernel void @fadd_b_a_a_f16(ptr addrspace(1) %out,
define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -919,11 +919,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -937,59 +937,59 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_neg_2.0_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1009,11 +1009,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1027,11 +1027,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
;
; VI-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1045,59 +1045,59 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
;
; GFX10-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_neg_2.0_neg_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, 2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1119,11 +1119,11 @@ define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(ptr addrspace(1) %out, pt
define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1137,11 +1137,11 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1155,59 +1155,59 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_neg_a_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -1229,11 +1229,11 @@ define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(ptr addrspace(1) %out, ptr ad
define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; VI-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1247,11 +1247,11 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
;
; VI-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -1265,59 +1265,59 @@ define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(ptr addrspace(1) %out, ptr ad
;
; GFX10-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX10-DENORM-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_2.0_a_neg_b_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -2358,11 +2358,11 @@ define amdgpu_kernel void @mad_fabs_sub_f16(ptr addrspace(1) noalias nocapture %
define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; VI-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2376,11 +2376,11 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2394,90 +2394,90 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v2, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v2, v1
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fsub_c_fadd_a_a_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fmac_f16_e32 v2, -2.0, v1
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v2, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
@@ -2499,11 +2499,11 @@ define amdgpu_kernel void @fsub_c_fadd_a_a_f16(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; VI-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; VI-FLUSH-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-FLUSH-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-FLUSH-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2517,11 +2517,11 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
;
; VI-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; VI-DENORM-CONTRACT: ; %bb.0:
-; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s1
-; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-DENORM-CONTRACT-NEXT: v_mov_b32_e32 v1, s3
+; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-DENORM-CONTRACT-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; VI-DENORM-CONTRACT-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -2535,90 +2535,90 @@ define amdgpu_kernel void @fsub_fadd_a_a_c_f16(ptr addrspace(1) %out, ptr addrsp
;
; GFX10-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-FLUSH-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-FLUSH-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-DENORM-STRICT: ; %bb.0:
-; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-STRICT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-STRICT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX10-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-STRICT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-STRICT-NEXT: s_endpgm
;
; GFX10-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; GFX10-DENORM-CONTRACT: ; %bb.0:
-; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-DENORM-CONTRACT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 glc dlc
+; GFX10-DENORM-CONTRACT-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc dlc
; GFX10-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DENORM-CONTRACT-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DENORM-CONTRACT-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-FLUSH-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-FLUSH-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-STRICT-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-DENORM-STRICT: ; %bb.0:
-; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-STRICT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-STRICT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-STRICT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-STRICT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-STRICT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-STRICT-NEXT: v_add_f16_e32 v1, v1, v1
; GFX11-DENORM-STRICT-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DENORM-STRICT-NEXT: v_sub_f16_e32 v1, v1, v2
-; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-STRICT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-STRICT-NEXT: s_nop 0
; GFX11-DENORM-STRICT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-STRICT-NEXT: s_endpgm
;
; GFX11-DENORM-CONTRACT-LABEL: fsub_fadd_a_a_c_f16:
; GFX11-DENORM-CONTRACT: ; %bb.0:
-; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-DENORM-CONTRACT-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-DENORM-CONTRACT-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[0:1] glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[0:1] offset:2 glc dlc
+; GFX11-DENORM-CONTRACT-NEXT: global_load_u16 v2, v0, s[2:3] offset:2 glc dlc
; GFX11-DENORM-CONTRACT-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-CONTRACT-NEXT: v_fma_f16 v1, v1, 2.0, -v2
-; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-DENORM-CONTRACT-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-DENORM-CONTRACT-NEXT: s_nop 0
; GFX11-DENORM-CONTRACT-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-CONTRACT-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
index ce5bb6617d9f4..997db917d847c 100644
--- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -41,24 +41,24 @@ define amdgpu_kernel void @fnearbyint_f16(ptr addrspace(1) %out, half %in) #1 {
;
; VI-LABEL: fnearbyint_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f16_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rndne_f16_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f16_e32 v1, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_rndne_f16_e32 v1, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -81,24 +81,24 @@ define amdgpu_kernel void @fnearbyint_f32(ptr addrspace(1) %out, float %in) #1 {
;
; VI-LABEL: fnearbyint_f32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rndne_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fnearbyint_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f32_e32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_rndne_f32_e32 v1, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,14 +168,14 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-LABEL: fnearbyint_v4f32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_rndne_f32_e32 v2, s6
; VI-NEXT: v_rndne_f32_e32 v1, s5
; VI-NEXT: v_rndne_f32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -183,14 +183,14 @@ define amdgpu_kernel void @fnearbyint_v4f32(ptr addrspace(1) %out, <4 x float> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f32_e32 v3, s7
; GFX11-NEXT: v_rndne_f32_e32 v2, s6
; GFX11-NEXT: v_rndne_f32_e32 v1, s5
; GFX11-NEXT: v_rndne_f32_e32 v0, s4
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -237,21 +237,21 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: nearbyint_f64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_rndne_f64_e32 v[0:1], s[6:7]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: nearbyint_f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[2:3]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[6:7]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,12 +309,12 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; VI-LABEL: nearbyint_v2f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -322,12 +322,12 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -406,18 +406,18 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-LABEL: nearbyint_v4f64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
; VI-NEXT: v_rndne_f64_e32 v[4:5], s[8:9]
; VI-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; VI-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v11, s3
-; VI-NEXT: v_mov_b32_e32 v9, s1
-; VI-NEXT: v_mov_b32_e32 v10, s2
-; VI-NEXT: v_mov_b32_e32 v8, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v11, s1
+; VI-NEXT: v_mov_b32_e32 v9, s3
+; VI-NEXT: v_mov_b32_e32 v10, s0
+; VI-NEXT: v_mov_b32_e32 v8, s2
; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; VI-NEXT: s_endpgm
@@ -426,7 +426,7 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_rndne_f64_e32 v[6:7], s[10:11]
@@ -434,8 +434,8 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> %
; GFX11-NEXT: v_rndne_f64_e32 v[2:3], s[6:7]
; GFX11-NEXT: v_rndne_f64_e32 v[0:1], s[4:5]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v8, v[4:7], s[2:3] offset:16
+; GFX11-NEXT: global_store_b128 v8, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 277dc01ccd99c..c19f7d11ecfc6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -3036,21 +3036,21 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f64(double %arg, i1
;
; VI-LABEL: s_fneg_select_infloop_regression_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s4, 0
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: s_bitcmp1_b32 s6, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_and_b64 s[6:7], s[4:5], exec
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[4:5]
+; VI-NEXT: s_and_b64 s[6:7], s[0:1], exec
+; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[0:1]
; VI-NEXT: s_cselect_b32 s2, 0, s2
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[4:5]
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, double 0.0, double %arg
@@ -3096,17 +3096,17 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_f16(half %arg, i1 %a
;
; VI-LABEL: s_fneg_select_infloop_regression_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitcmp1_b32 s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[2:3]
+; VI-NEXT: s_bitcmp1_b32 s4, 16
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1]
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
%i = select i1 %arg1, half 0.0, half %arg
@@ -3236,19 +3236,19 @@ define amdgpu_kernel void @s_fneg_select_infloop_regression_v2f32(<2 x float> %a
; VI-LABEL: s_fneg_select_infloop_regression_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s6, 0
; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[2:3]
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: v_cndmask_b32_e64 v2, -v1, v0, s[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[2:3]
-; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_cndmask_b32_e64 v0, -v1, v0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[0:1]
+; VI-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%i = select i1 %arg1, <2 x float> zeroinitializer, <2 x float> %arg
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 2c9042ec17da8..e3d3fdd82992e 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -21,11 +21,11 @@ define amdgpu_kernel void @fneg_fabs_fadd_f64(ptr addrspace(1) %out, double %x,
; VI-LABEL: fneg_fabs_fadd_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|v[0:1]|
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,13 +52,13 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: v_fneg_fabs_fadd_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f64 v[0:1], s[2:3], -|s[2:3]|
+; VI-NEXT: v_add_f64 v[0:1], s[0:1], -|s[0:1]|
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%x = load double, ptr addrspace(1) %xptr, align 8
@@ -89,11 +89,11 @@ define amdgpu_kernel void @fneg_fabs_fmul_f64(ptr addrspace(1) %out, double %x,
; VI-LABEL: fneg_fabs_fmul_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mul_f64 v[0:1], s[0:1], -|v[0:1]|
+; VI-NEXT: v_mul_f64 v[0:1], s[2:3], -|v[0:1]|
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -122,12 +122,12 @@ define amdgpu_kernel void @fneg_fabs_free_f64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: fneg_fabs_free_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_or_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -155,12 +155,12 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: fneg_fabs_fn_free_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_or_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -188,13 +188,13 @@ define amdgpu_kernel void @fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], doubl
; VI-LABEL: fneg_fabs_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s3, 31
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_or_b32 s0, s3, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%fabs = call double @llvm.fabs.f64(double %in)
@@ -223,16 +223,16 @@ define amdgpu_kernel void @fneg_fabs_v2f64(ptr addrspace(1) %out, <2 x double> %
; VI-LABEL: fneg_fabs_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s2, s7, 0x80000000
-; VI-NEXT: s_or_b32 s3, s5, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: s_or_b32 s1, s5, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
@@ -268,27 +268,27 @@ define amdgpu_kernel void @fneg_fabs_v4f64(ptr addrspace(1) %out, <4 x double> %
; VI-LABEL: fneg_fabs_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s7, 31
; VI-NEXT: s_bitset1_b32 s5, 31
-; VI-NEXT: s_or_b32 s2, s11, 0x80000000
-; VI-NEXT: s_or_b32 s3, s9, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_or_b32 s0, s11, 0x80000000
+; VI-NEXT: s_or_b32 s1, s9, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v2, s10
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 32033c52fc998..2a1ca0fa1633f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -79,13 +79,13 @@ define amdgpu_kernel void @fneg_fabsf_free_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fneg_fabsf_free_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%bc = bitcast i32 %in to float
@@ -141,13 +141,13 @@ define amdgpu_kernel void @fneg_fabsf_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: fneg_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bitset1_b32 s2, 31
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_or_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%fabs = call float @llvm.fabs.f32(float %in)
@@ -177,13 +177,13 @@ define amdgpu_kernel void @v_fneg_fabsf_f32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: v_fneg_fabsf_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_or_b32_e32 v2, 0x80000000, v2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -251,18 +251,18 @@ define amdgpu_kernel void @fneg_fabsf_v4f32(ptr addrspace(1) %out, <4 x float> %
; VI-LABEL: fneg_fabsf_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_or_b32 s2, s7, 0x80000000
-; VI-NEXT: s_or_b32 s3, s6, 0x80000000
+; VI-NEXT: s_or_b32 s0, s7, 0x80000000
+; VI-NEXT: s_or_b32 s1, s6, 0x80000000
; VI-NEXT: s_bitset1_b32 s5, 31
; VI-NEXT: s_bitset1_b32 s4, 31
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
%fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 94fc9293e774c..66b5cadd15d24 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -19,26 +19,26 @@ define amdgpu_kernel void @s_fneg_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: s_fneg_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -116,18 +116,18 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; VI-LABEL: s_fneg_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s7, 0x80000000
-; VI-NEXT: s_xor_b32 s3, s6, 0x80000000
+; VI-NEXT: s_xor_b32 s0, s7, 0x80000000
+; VI-NEXT: s_xor_b32 s1, s6, 0x80000000
; VI-NEXT: s_xor_b32 s5, s5, 0x80000000
; VI-NEXT: s_xor_b32 s4, s4, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_mov_b32_e32 v3, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -135,17 +135,17 @@ define amdgpu_kernel void @s_fneg_v4f32(ptr addrspace(1) nocapture %out, <4 x fl
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s7, 0x80000000
-; GFX11-NEXT: s_xor_b32 s3, s6, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s7, 0x80000000
+; GFX11-NEXT: s_xor_b32 s1, s6, 0x80000000
; GFX11-NEXT: s_xor_b32 s4, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s5, s5, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s2
-; GFX11-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,24 +168,24 @@ define amdgpu_kernel void @fsub0_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fsub0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f32_e64 v2, 0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f32_e64 v2, 0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fsub0_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 0, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f32_e64 v1, 0, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -210,26 +210,26 @@ define amdgpu_kernel void @fneg_free_f32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: fneg_free_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_free_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,24 +253,24 @@ define amdgpu_kernel void @fneg_fold_f32(ptr addrspace(1) %out, float %in) {
;
; VI-LABEL: fneg_fold_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, -s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, -s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fold_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e64 v1, -s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_mul_f32_e64 v1, -s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -295,24 +295,24 @@ define amdgpu_kernel void @bitpreserve_fneg_f32(ptr addrspace(1) %out, float %in
;
; VI-LABEL: bitpreserve_fneg_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, s2, -4.0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, s4, -4.0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: bitpreserve_fneg_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mul_f32_e64 v1, s2, -4.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_mul_f32_e64 v1, s4, -4.0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -339,26 +339,26 @@ define amdgpu_kernel void @s_fneg_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_fneg_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s4, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -391,24 +391,24 @@ define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_fneg_i32_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f32_e64 v2, 2.0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f32_e64 v2, 2.0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i32_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f32_e64 v1, 2.0, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -448,25 +448,25 @@ define amdgpu_kernel void @s_fneg_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: s_fneg_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_xor_b32 s0, s3, 0x80000000
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_xor_b32 s0, s7, 0x80000000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000
+; GFX11-NEXT: s_xor_b32 s0, s7, 0x80000000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -500,21 +500,21 @@ define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: s_fneg_i64_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_add_f64 v[0:1], -s[6:7], 2.0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i64_fp_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[0:1], -s[2:3], 2.0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_add_f64 v[0:1], -s[6:7], 2.0
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -563,24 +563,24 @@ define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
;
; VI-LABEL: s_fneg_i16_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_sub_f16_e64 v2, 2.0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_sub_f16_e64 v2, 2.0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_i16_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s2
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_sub_f16_e64 v1, 2.0, s4
+; GFX11-NEXT: global_store_b16 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -631,31 +631,31 @@ define amdgpu_kernel void @s_fneg_v2i16(ptr addrspace(1) %out, i32 %arg) {
;
; VI-LABEL: s_fneg_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: s_xor_b32 s3, s3, 0x8000
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_or_b32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_xor_b32 s1, s4, 0x8000
+; VI-NEXT: s_xor_b32 s0, s0, 0x8000
+; VI-NEXT: s_and_b32 s1, s1, 0xffff
+; VI-NEXT: s_lshl_b32 s0, s0, 16
+; VI-NEXT: s_or_b32 s0, s1, s0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s2, s2, 0x80008000
+; GFX11-NEXT: s_xor_b32 s0, s4, 0x80008000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -714,31 +714,31 @@ define amdgpu_kernel void @s_fneg_v2i16_fp_use(ptr addrspace(1) %out, i32 %arg)
;
; VI-LABEL: s_fneg_v2i16_fp_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: s_xor_b32 s3, s3, 0x8000
-; VI-NEXT: s_xor_b32 s2, s2, 0x8000
-; VI-NEXT: v_mov_b32_e32 v2, s3
-; VI-NEXT: v_add_f16_e64 v1, s2, 2.0
+; VI-NEXT: s_lshr_b32 s0, s4, 16
+; VI-NEXT: s_xor_b32 s0, s0, 0x8000
+; VI-NEXT: s_xor_b32 s1, s4, 0x8000
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_add_f16_e64 v1, s1, 2.0
; VI-NEXT: v_add_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_v2i16_fp_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v1, s2, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_pk_add_f16 v1, s4, 2.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 7f87b41127767..157b7481b506c 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -37,10 +37,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -49,10 +49,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -76,10 +76,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -88,10 +88,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
;
; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_b32 v3, v[0:1]
; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index ca2fa0f20f0f5..afca4504f5704 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -22,28 +22,28 @@ define amdgpu_kernel void @test_isinf_pattern(ptr addrspace(1) nocapture %out, f
;
; VI-LABEL: test_isinf_pattern:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isinf_pattern:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x204
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x204
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -70,28 +70,28 @@ define amdgpu_kernel void @test_not_isinf_pattern_0(ptr addrspace(1) nocapture %
;
; VI-LABEL: test_not_isinf_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_nlg_f32_e64 s[2:3], |s2|, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cmp_nlg_f32_e64 s[0:1], |s4|, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_not_isinf_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x7f800000, |s4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -115,20 +115,20 @@ define amdgpu_kernel void @test_not_isinf_pattern_1(ptr addrspace(1) nocapture %
;
; VI-LABEL: test_not_isinf_pattern_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_not_isinf_pattern_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -155,28 +155,28 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -205,28 +205,28 @@ define amdgpu_kernel void @test_isfinite_pattern_1(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,27 +253,27 @@ define amdgpu_kernel void @test_isfinite_not_pattern_0(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_not_pattern_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -306,31 +306,31 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s4, s4
; VI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s4
+; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -417,31 +417,31 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur
; VI-LABEL: test_isfinite_not_pattern_3:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s4, s4
+; VI-NEXT: v_cmp_u_f32_e64 s[0:1], s4, s4
; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s4|, v0
-; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_not_pattern_3:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2
-; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2|
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, s4, s4
+; GFX11-NEXT: v_cmp_neq_f32_e64 s1, 0x7f800000, |s4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -470,28 +470,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4(ptr addrspace(1) nocapture %o
;
; VI-LABEL: test_isfinite_pattern_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -520,28 +520,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1)
;
; VI-LABEL: test_isfinite_pattern_4_commute_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4_commute_and:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -592,17 +592,17 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
; GFX11-LABEL: test_not_isfinite_pattern_4_wrong_ord_test:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x2
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x50
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x50
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3
-; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f32_e64 s1, s4, 0x1f8
+; GFX11-NEXT: v_cmp_o_f32_e64 s0, s4, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s2, s3, s2
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_and_b32 s0, s0, s1
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -632,28 +632,28 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
;
; VI-LABEL: test_isinf_pattern_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x204
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isinf_pattern_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x204
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x204
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -684,28 +684,28 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_pattern_0_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -738,28 +738,28 @@ define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocaptur
;
; VI-LABEL: test_isfinite_pattern_4_f16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x1f8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_cmp_class_f16_e32 vcc, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: test_isfinite_pattern_4_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8
+; GFX11-NEXT: v_cmp_class_f16_e64 s0, s4, 0x1f8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index 2928647a9627d..0b49b736ab5dd 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -52,22 +52,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
-; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
@@ -75,11 +75,11 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; GFX12-LABEL: raw_buffer_atomic_min_noret_f32:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -118,22 +118,22 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_buffer_atomic_min_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
@@ -318,13 +318,13 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre
; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b96 s[8:10], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s0, 4
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN
-; GFX12-NEXT: v_mov_b32_e32 v1, s6
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[4:7], s0 offen th:TH_ATOMIC_NT_RETURN
+; GFX12-NEXT: v_mov_b32_e32 v1, s10
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: ds_store_b32 v1, v0
; GFX12-NEXT: s_endpgm
@@ -444,22 +444,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x1
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; GFX1100-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX1100: ; %bb.0: ; %main_body
; GFX1100-NEXT: s_clause 0x1
-; GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; GFX1100-NEXT: s_nop 0
; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT: s_endpgm
@@ -467,11 +467,11 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; GFX12-LABEL: raw_buffer_atomic_max_noret_f32:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[4:7], null offen
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -510,22 +510,22 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_buffer_atomic_max_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index f4745a5acbcd6..c35da12c43e8a 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -104,22 +104,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f32(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmin v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_ptr_buffer_atomic_min_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_min_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
@@ -462,22 +462,22 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f32(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x1
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: buffer_atomic_fmax v0, v1, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
;
; G_GFX1100-LABEL: raw_ptr_buffer_atomic_max_noret_f32:
; G_GFX1100: ; %bb.0: ; %main_body
; G_GFX1100-NEXT: s_clause 0x1
-; G_GFX1100-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; G_GFX1100-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; G_GFX1100-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; G_GFX1100-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; G_GFX1100-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1100-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen
+; G_GFX1100-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; G_GFX1100-NEXT: buffer_atomic_max_f32 v0, v1, s[4:7], 0 offen
; G_GFX1100-NEXT: s_nop 0
; G_GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; G_GFX1100-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index bf3dbec6fa9ef..2663bd047930f 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1022,22 +1022,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1047,22 +1047,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1072,22 +1072,22 @@ main_body:
define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
; GFX90A-LABEL: global_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: v_mov_b32_e32 v1, s3
-; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
+; GFX90A-NEXT: v_mov_b32_e32 v1, s7
+; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: v_mov_b32_e32 v1, s3
-; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, s6
+; GFX940-NEXT: v_mov_b32_e32 v1, s7
+; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
@@ -1097,35 +1097,35 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB39_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
@@ -1139,14 +1139,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB39_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB39_2:
@@ -1166,13 +1166,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB40_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB40_2:
@@ -1187,14 +1187,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB40_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB40_2:
@@ -1207,35 +1207,35 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB41_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
@@ -1249,14 +1249,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB41_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: .LBB41_2:
@@ -1276,13 +1276,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB42_2
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: .LBB42_2:
@@ -1297,14 +1297,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB42_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB42_2:
@@ -1479,33 +1479,33 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_mov_b64 s[2:3], exec
-; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB49_3
; GFX90A-NEXT: ; %bb.1:
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
-; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s4
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
@@ -1519,14 +1519,14 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB49_2
; GFX940-NEXT: ; %bb.1:
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[4:5]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: .LBB49_2:
@@ -1564,10 +1564,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1581,11 +1581,11 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1]
; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1593,10 +1593,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1636,10 +1636,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
@@ -1760,23 +1760,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1829,10 +1829,10 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
;
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1846,23 +1846,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmin_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
@@ -1892,23 +1892,23 @@ main_body:
define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s0
-; GFX90A-NEXT: v_mov_b32_e32 v1, s1
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: v_mov_b32_e32 v3, s3
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s5
+; GFX90A-NEXT: v_mov_b32_e32 v2, s6
+; GFX90A-NEXT: v_mov_b32_e32 v3, s7
; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: flat_atomic_fmax_f64_noret:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s0
-; GFX940-NEXT: v_mov_b32_e32 v1, s1
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: v_mov_b32_e32 v3, s3
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: v_mov_b32_e32 v2, s6
+; GFX940-NEXT: v_mov_b32_e32 v3, s7
; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index a058c1119d4fd..f71045659b7b3 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -54,14 +54,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_buffer_atomic_min_noret_f64:
@@ -104,14 +104,14 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
@@ -291,14 +291,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; GFX1030: ; %bb.0: ; %main_body
; GFX1030-NEXT: s_clause 0x2
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; GFX1030-NEXT: s_endpgm
;
; G_SI-LABEL: raw_buffer_atomic_max_noret_f64:
@@ -341,14 +341,14 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> inreg %rsrc
; G_GFX1030-LABEL: raw_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index 046c92a2fc63f..f308174f051f3 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -104,14 +104,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmin_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
@@ -341,14 +341,14 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
; G_GFX1030-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
; G_GFX1030: ; %bb.0: ; %main_body
; G_GFX1030-NEXT: s_clause 0x2
-; G_GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; G_GFX1030-NEXT: s_load_dword s6, s[0:1], 0x3c
-; G_GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; G_GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; G_GFX1030-NEXT: s_load_dword s8, s[0:1], 0x3c
+; G_GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; G_GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX1030-NEXT: v_mov_b32_e32 v0, s4
-; G_GFX1030-NEXT: v_mov_b32_e32 v1, s5
-; G_GFX1030-NEXT: v_mov_b32_e32 v2, s6
-; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[0:3], 0 offen
+; G_GFX1030-NEXT: v_mov_b32_e32 v0, s2
+; G_GFX1030-NEXT: v_mov_b32_e32 v1, s3
+; G_GFX1030-NEXT: v_mov_b32_e32 v2, s8
+; G_GFX1030-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[4:7], 0 offen
; G_GFX1030-NEXT: s_endpgm
main_body:
%ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
index b4fee7017d698..facb3e55b6d6e 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -20,12 +20,12 @@ define amdgpu_kernel void @fp_to_sint_i32(ptr addrspace(1) %out, float %in) {
; VI-LABEL: fp_to_sint_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i32:
@@ -59,12 +59,12 @@ define amdgpu_kernel void @fp_to_sint_i32_fabs(ptr addrspace(1) %out, float %in)
; VI-LABEL: fp_to_sint_i32_fabs:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e64 v0, |s2|
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i32_fabs:
@@ -147,17 +147,17 @@ define amdgpu_kernel void @fp_to_sint_v4i32(ptr addrspace(1) %out, ptr addrspace
;
; VI-LABEL: fp_to_sint_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_i32_f32_e32 v3, s7
-; VI-NEXT: v_cvt_i32_f32_e32 v2, s6
-; VI-NEXT: v_cvt_i32_f32_e32 v1, s5
-; VI-NEXT: v_cvt_i32_f32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: v_cvt_i32_f32_e32 v3, s3
+; VI-NEXT: v_cvt_i32_f32_e32 v2, s2
+; VI-NEXT: v_cvt_i32_f32_e32 v1, s1
+; VI-NEXT: v_cvt_i32_f32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v4i32:
@@ -217,24 +217,24 @@ define amdgpu_kernel void @fp_to_sint_i64 (ptr addrspace(1) %out, float %in) {
; VI-LABEL: fp_to_sint_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s4, 0x2f800000
-; VI-NEXT: s_mov_b32 s5, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0x2f800000
+; VI-NEXT: s_mov_b32 s1, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s2
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s4
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s0
; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, v1, s5, |v0|
+; VI-NEXT: v_fma_f32 v2, v1, s1, |v0|
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_xor_b32_e32 v0, v2, v3
; VI-NEXT: v_xor_b32_e32 v1, v1, v3
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v3
; VI-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_i64:
@@ -509,24 +509,24 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
;
; VI-LABEL: fp_to_sint_v4i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s8, 0x2f800000
-; VI-NEXT: s_mov_b32 s9, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0x2f800000
+; VI-NEXT: s_mov_b32 s1, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s5
-; VI-NEXT: v_mul_f32_e64 v1, |v0|, s8
+; VI-NEXT: v_trunc_f32_e32 v0, s9
+; VI-NEXT: v_mul_f32_e64 v1, |v0|, s0
; VI-NEXT: v_floor_f32_e32 v1, v1
-; VI-NEXT: v_fma_f32 v2, v1, s9, |v0|
+; VI-NEXT: v_fma_f32 v2, v1, s1, |v0|
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
-; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_trunc_f32_e32 v4, s8
; VI-NEXT: v_cvt_u32_f32_e32 v1, v1
-; VI-NEXT: v_mul_f32_e64 v3, |v4|, s8
+; VI-NEXT: v_mul_f32_e64 v3, |v4|, s0
; VI-NEXT: v_floor_f32_e32 v3, v3
; VI-NEXT: v_ashrrev_i32_e32 v0, 31, v0
; VI-NEXT: v_cvt_u32_f32_e32 v5, v3
-; VI-NEXT: v_fma_f32 v3, v3, s9, |v4|
+; VI-NEXT: v_fma_f32 v3, v3, s1, |v4|
; VI-NEXT: v_xor_b32_e32 v2, v2, v0
; VI-NEXT: v_cvt_u32_f32_e32 v6, v3
; VI-NEXT: v_xor_b32_e32 v1, v1, v0
@@ -534,22 +534,22 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v0, vcc
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v4
; VI-NEXT: v_xor_b32_e32 v4, v5, v1
-; VI-NEXT: v_trunc_f32_e32 v5, s7
+; VI-NEXT: v_trunc_f32_e32 v5, s11
; VI-NEXT: v_xor_b32_e32 v0, v6, v1
-; VI-NEXT: v_mul_f32_e64 v6, |v5|, s8
+; VI-NEXT: v_mul_f32_e64 v6, |v5|, s0
; VI-NEXT: v_floor_f32_e32 v6, v6
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
-; VI-NEXT: v_fma_f32 v6, v6, s9, |v5|
+; VI-NEXT: v_fma_f32 v6, v6, s1, |v5|
; VI-NEXT: v_cvt_u32_f32_e32 v6, v6
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
; VI-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc
; VI-NEXT: v_ashrrev_i32_e32 v4, 31, v5
-; VI-NEXT: v_trunc_f32_e32 v8, s6
+; VI-NEXT: v_trunc_f32_e32 v8, s10
; VI-NEXT: v_xor_b32_e32 v5, v6, v4
-; VI-NEXT: v_mul_f32_e64 v6, |v8|, s8
+; VI-NEXT: v_mul_f32_e64 v6, |v8|, s0
; VI-NEXT: v_floor_f32_e32 v6, v6
; VI-NEXT: v_cvt_u32_f32_e32 v9, v6
-; VI-NEXT: v_fma_f32 v6, v6, s9, |v8|
+; VI-NEXT: v_fma_f32 v6, v6, s1, |v8|
; VI-NEXT: v_cvt_u32_f32_e32 v10, v6
; VI-NEXT: v_xor_b32_e32 v7, v7, v4
; VI-NEXT: v_sub_u32_e32 v6, vcc, v5, v4
@@ -558,10 +558,10 @@ define amdgpu_kernel void @fp_to_sint_v4i64(ptr addrspace(1) %out, <4 x float> %
; VI-NEXT: v_xor_b32_e32 v4, v10, v5
; VI-NEXT: v_xor_b32_e32 v8, v9, v5
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_subb_u32_e32 v5, vcc, v8, v5, vcc
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_v4i64:
@@ -749,14 +749,14 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
;
; VI-LABEL: fp_to_uint_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i1:
@@ -799,14 +799,14 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
;
; VI-LABEL: fp_to_uint_fabs_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], -1.0, |s4|
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], -1.0, |s2|
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_fabs_f32_to_i1:
@@ -850,12 +850,12 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in)
; VI-LABEL: fp_to_sint_f32_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_i32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_sint_f32_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
index f8ede1cd557a7..364e8ca6f4ea0 100644
--- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -20,12 +20,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i32 (ptr addrspace(1) %out, float %
; VI-LABEL: fp_to_uint_f32_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i32:
@@ -107,17 +107,17 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(ptr addrspace(1) %out, ptr
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_u32_f32_e32 v3, s7
-; VI-NEXT: v_cvt_u32_f32_e32 v2, s6
-; VI-NEXT: v_cvt_u32_f32_e32 v1, s5
-; VI-NEXT: v_cvt_u32_f32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: v_cvt_u32_f32_e32 v3, s3
+; VI-NEXT: v_cvt_u32_f32_e32 v2, s2
+; VI-NEXT: v_cvt_u32_f32_e32 v1, s1
+; VI-NEXT: v_cvt_u32_f32_e32 v0, s0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i32:
@@ -170,18 +170,18 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i64(ptr addrspace(1) %out, float %x
; VI-LABEL: fp_to_uint_f32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xcf800000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trunc_f32_e32 v0, s2
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_floor_f32_e32 v2, v1
-; VI-NEXT: v_fma_f32 v0, v2, s3, v0
+; VI-NEXT: v_fma_f32 v0, v2, s0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i64:
@@ -412,38 +412,38 @@ define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(ptr addrspace(1) %out, <4 x
;
; VI-LABEL: fp_to_uint_v4f32_to_v4i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s2, 0xcf800000
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s0, 0xcf800000
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_trunc_f32_e32 v0, s5
-; VI-NEXT: v_trunc_f32_e32 v4, s4
+; VI-NEXT: v_trunc_f32_e32 v0, s9
+; VI-NEXT: v_trunc_f32_e32 v4, s8
; VI-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; VI-NEXT: v_mul_f32_e32 v2, 0x2f800000, v4
; VI-NEXT: v_floor_f32_e32 v5, v1
; VI-NEXT: v_floor_f32_e32 v6, v2
-; VI-NEXT: v_fma_f32 v0, v5, s2, v0
+; VI-NEXT: v_fma_f32 v0, v5, s0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v2, v0
-; VI-NEXT: v_fma_f32 v0, v6, s2, v4
-; VI-NEXT: v_trunc_f32_e32 v4, s7
+; VI-NEXT: v_fma_f32 v0, v6, s0, v4
+; VI-NEXT: v_trunc_f32_e32 v4, s11
; VI-NEXT: v_cvt_u32_f32_e32 v3, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; VI-NEXT: v_trunc_f32_e32 v8, s6
+; VI-NEXT: v_trunc_f32_e32 v8, s10
; VI-NEXT: v_cvt_u32_f32_e32 v1, v6
; VI-NEXT: v_floor_f32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, 0x2f800000, v8
; VI-NEXT: v_floor_f32_e32 v9, v5
-; VI-NEXT: v_fma_f32 v4, v6, s2, v4
+; VI-NEXT: v_fma_f32 v4, v6, s0, v4
; VI-NEXT: v_cvt_u32_f32_e32 v7, v6
; VI-NEXT: v_cvt_u32_f32_e32 v6, v4
-; VI-NEXT: v_fma_f32 v4, v9, s2, v8
+; VI-NEXT: v_fma_f32 v4, v9, s0, v8
; VI-NEXT: v_cvt_u32_f32_e32 v5, v9
; VI-NEXT: v_cvt_u32_f32_e32 v4, v4
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_v4f32_to_v4i64:
@@ -631,14 +631,14 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i1(ptr addrspace(1) %out, float %in
;
; VI-LABEL: fp_to_uint_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i1:
@@ -681,14 +681,14 @@ define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(ptr addrspace(1) %out, floa
;
; VI-LABEL: fp_to_uint_fabs_f32_to_i1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f32_e64 s[4:5], 1.0, |s4|
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f32_e64 s[0:1], 1.0, |s2|
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_fabs_f32_to_i1:
@@ -732,12 +732,12 @@ define amdgpu_kernel void @fp_to_uint_f32_to_i16(ptr addrspace(1) %out, float %i
; VI-LABEL: fp_to_uint_f32_to_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_u32_f32_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: fp_to_uint_f32_to_i16:
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 82c25c01b1779..2c74b3da8c981 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fpext_f16_to_f32(
;
; GFX89-LABEL: fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fpext_f16_to_f64(
;
; GFX89-LABEL: fpext_f16_to_f64:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_f16_to_f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -161,42 +161,42 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32(
;
; GFX89-LABEL: fpext_v2f16_to_v2f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX89-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_v2f16_to_v2f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -234,38 +234,38 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
;
; GFX89-LABEL: fpext_v2f16_to_v2f64:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX89-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v1
; GFX89-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fpext_v2f16_to_v2f64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -274,7 +274,7 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64(
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -299,38 +299,27 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
-; VI-LABEL: s_fneg_fpext_f16_to_f32:
-; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; VI-NEXT: s_endpgm
-;
-; GFX9-LABEL: s_fneg_fpext_f16_to_f32:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s2
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: s_fneg_fpext_f16_to_f32:
+; GFX89: ; %bb.0: ; %entry
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s2
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: s_fneg_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -363,38 +352,38 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -429,38 +418,38 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -495,38 +484,38 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_fabs_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -568,45 +557,45 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -649,45 +638,45 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX89-NEXT: v_mul_f16_e64 v0, -v0, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0
; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -729,45 +718,45 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -810,45 +799,45 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX89-NEXT: v_mul_f16_e64 v0, |v0|, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0|
; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -890,45 +879,45 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -972,45 +961,45 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
;
; GFX89-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX89-NEXT: v_mul_f16_e64 v0, -|v0|, v0
-; GFX89-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_dword v1, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0|
; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0
-; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b32 v1, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1031,3 +1020,6 @@ entry:
declare half @llvm.fabs.f16(half) #1
attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX9: {{.*}}
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
index 238010ec05e4d..ca5870841cadd 100644
--- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fptosi_f16_to_i16(
;
; VI-LABEL: fptosi_f16_to_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fptosi_f16_to_i32(
;
; VI-LABEL: fptosi_f16_to_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -164,43 +164,43 @@ define amdgpu_kernel void @fptosi_f16_to_i64(
;
; VI-LABEL: fptosi_f16_to_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_i32_f32_e32 v0, v0
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -241,37 +241,37 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
;
; VI-LABEL: fptosi_v2f16_to_v2i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_i16_f16_e32 v1, v0
; VI-NEXT: v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0
@@ -280,7 +280,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -318,38 +318,38 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
;
; VI-LABEL: fptosi_v2f16_to_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_i32_f32_e32 v0, v1
; VI-NEXT: v_cvt_i32_f32_e32 v1, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -358,7 +358,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -401,17 +401,17 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
;
; VI-LABEL: fptosi_v2f16_to_v2i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -419,22 +419,22 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; VI-NEXT: v_cvt_i32_f32_e32 v2, v2
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_v2f16_to_v2i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -446,7 +446,7 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -475,28 +475,28 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) {
;
; VI-LABEL: fptosi_f16_to_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], -1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f16_e64 s[0:1], -1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptosi_f16_to_i1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s0, -1.0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index 1116dc9ae2e5b..2d5ae03926dd5 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @fptoui_f16_to_i16(
;
; VI-LABEL: fptoui_f16_to_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_u16_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @fptoui_f16_to_i32(
;
; VI-LABEL: fptoui_f16_to_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -164,43 +164,43 @@ define amdgpu_kernel void @fptoui_f16_to_i64(
;
; VI-LABEL: fptoui_f16_to_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: v_cvt_u32_f32_e32 v0, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -240,37 +240,37 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
;
; VI-LABEL: fptoui_v2f16_to_v2i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_u16_f16_e32 v1, v0
; VI-NEXT: v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0
@@ -279,7 +279,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,38 +317,38 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
;
; VI-LABEL: fptoui_v2f16_to_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_u32_f32_e32 v0, v1
; VI-NEXT: v_cvt_u32_f32_e32 v1, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -357,7 +357,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -400,17 +400,17 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
;
; VI-LABEL: fptoui_v2f16_to_v2i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -418,22 +418,22 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; VI-NEXT: v_cvt_u32_f32_e32 v2, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_v2f16_to_v2i64:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -444,7 +444,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2
; GFX11-NEXT: v_mov_b32_e32 v3, v1
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -473,28 +473,28 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) {
;
; VI-LABEL: fptoui_f16_to_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_eq_f16_e64 s[4:5], 1.0, s4
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: v_cmp_eq_f16_e64 s[0:1], 1.0, s2
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: fptoui_f16_to_i1:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s0, 1.0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 6cc7368eeae61..38730363d3b93 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -41,94 +41,94 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -176,102 +176,102 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
;
; VI-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -323,109 +323,109 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
;
; VI-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f32_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f32_to_v2f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -481,93 +481,93 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
;
; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; VI-SDAG-NEXT: v_cvt_f16_f32_sdwa v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
; GFX9-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, v[2:3]
@@ -577,27 +577,27 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[0:1]
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -643,94 +643,94 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fneg_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fneg_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -s0
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -777,94 +777,94 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fabs_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -911,94 +911,94 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
;
; VI-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; VI-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; VI-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -|v0|
-; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fneg_fabs_fptrunc_f32_to_f16:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, -|s0|
+; GFX11-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1046,98 +1046,98 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_zext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1185,98 +1185,98 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
;
; VI-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e64 v0, |v0|
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s2|
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e64 v0, |s0|
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1327,102 +1327,102 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
;
; VI-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; VI-SDAG: ; %bb.0: ; %entry
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
-; VI-SDAG-NEXT: s_mov_b32 s10, s6
-; VI-SDAG-NEXT: s_mov_b32 s11, s7
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_mov_b32 s10, s2
+; VI-SDAG-NEXT: s_mov_b32 s11, s3
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_mov_b32 s8, s2
-; VI-SDAG-NEXT: s_mov_b32 s9, s3
+; VI-SDAG-NEXT: s_mov_b32 s8, s6
+; VI-SDAG-NEXT: s_mov_b32 s9, s7
; VI-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; VI-GISEL: ; %bb.0: ; %entry
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
+; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX9-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX9-SDAG: ; %bb.0: ; %entry
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX9-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX9-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s3
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX9-SDAG-NEXT: s_mov_b32 s9, s3
+; GFX9-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX9-SDAG-NEXT: s_mov_b32 s9, s7
; GFX9-SDAG-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-SDAG-NEXT: s_mov_b32 s4, s0
-; GFX9-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX9-SDAG-NEXT: s_mov_b32 s0, s4
+; GFX9-SDAG-NEXT: s_mov_b32 s1, s5
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX9-GISEL: ; %bb.0: ; %entry
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-GISEL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX9-GISEL-NEXT: s_mov_b32 s7, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX9-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX11-SDAG: ; %bb.0: ; %entry
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
-; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s10, s6
-; GFX11-SDAG-NEXT: s_mov_b32 s11, s7
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s10, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s11, s3
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_mov_b32 s8, s2
-; GFX11-SDAG-NEXT: s_mov_b32 s9, s3
-; GFX11-SDAG-NEXT: s_mov_b32 s4, s0
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s6
+; GFX11-SDAG-NEXT: s_mov_b32 s9, s7
+; GFX11-SDAG-NEXT: s_mov_b32 s0, s4
; GFX11-SDAG-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-SDAG-NEXT: s_mov_b32 s5, s1
+; GFX11-SDAG-NEXT: s_mov_b32 s1, s5
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f32_to_f16_sext_i32:
; GFX11-GISEL: ; %bb.0: ; %entry
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-GISEL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
+; GFX11-GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
index e4aa4d1d3ddb5..bcef7bc7b46e0 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll
@@ -28,66 +28,66 @@ define amdgpu_kernel void @fptrunc_f64_to_f32(ptr addrspace(1) %out, double %in)
;
; VI-SDAG-LABEL: fptrunc_f64_to_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s7, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s6, -1
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s2, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-SDAG-NEXT: s_mov_b32 s4, s0
-; VI-SDAG-NEXT: s_mov_b32 s5, s1
-; VI-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-SDAG-NEXT: s_mov_b32 s0, s4
+; VI-SDAG-NEXT: s_mov_b32 s1, s5
+; VI-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_f64_to_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_f64_to_f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_f64_to_f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_f64_to_f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_f64_to_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -218,358 +218,356 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(ptr addrspace(1) %out, double %in)
;
; VI-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-SAFE-GISEL: ; %bb.0:
-; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; VI-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; VI-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
+; VI-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; VI-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; VI-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; VI-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
+; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; VI-SAFE-GISEL-NEXT: s_sub_i32 s7, 1, s4
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s4, 12
-; VI-SAFE-GISEL-NEXT: s_max_i32 s7, s7, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s2, s6
-; VI-SAFE-GISEL-NEXT: s_min_i32 s7, s7, 13
-; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s2, 12
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s2, s7
-; VI-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; VI-SAFE-GISEL-NEXT: s_lshl_b32 s7, s8, s7
-; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s7, s2
+; VI-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s8, s2
-; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; VI-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s0
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s3, s0, 12
+; VI-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s3, s1, s3
+; VI-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
+; VI-SAFE-GISEL-NEXT: s_bitset1_b32 s1, 12
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s8, s1, s6
+; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; VI-SAFE-GISEL-NEXT: s_lshl_b32 s6, s8, s6
+; VI-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s1
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s1, s8, s1
+; VI-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, s1, s3
+; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; VI-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; VI-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; VI-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; VI-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; VI-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; VI-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; VI-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; VI-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-SAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
-; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; VI-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; VI-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; VI-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; VI-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; VI-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; VI-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; VI-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; VI-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
+; VI-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; VI-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-SAFE-GISEL-NEXT: s_mov_b32 s7, 0xf000
+; VI-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-SAFE-GISEL-NEXT: s_endpgm
;
; VI-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-SDAG: ; %bb.0:
-; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0xf000
+; VI-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; VI-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-UNSAFE-SDAG-NEXT: s_endpgm
;
; VI-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; VI-UNSAFE-GISEL: ; %bb.0:
-; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; VI-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0xf000
; VI-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-UNSAFE-GISEL-NEXT: s_endpgm
;
; GFX10-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-SDAG: ; %bb.0:
-; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
-; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
-; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s0, s7, 0x1ff
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s1, s7, 8
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s1, s1, 0xffe
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-SAFE-SDAG-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX10-SAFE-SDAG-NEXT: s_sub_i32 s2, 0x3f1, s0
+; GFX10-SAFE-SDAG-NEXT: s_addk_i32 s0, 0xfc10
+; GFX10-SAFE-SDAG-NEXT: v_med3_i32 v1, s2, 0, 13
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SAFE-SDAG-NEXT: s_lshl_b32 s6, s0, 12
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s1, s1, s2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s1, 0x1000
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s1, s6
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s3, s2, s3
+; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s3
+; GFX10-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX10-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 1
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, s6
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s3, s2, 7
+; GFX10-SAFE-SDAG-NEXT: s_cmp_gt_i32 s3, 5
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
-; GFX10-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX10-SAFE-SDAG-NEXT: s_cmp_eq_u32 s3, 3
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s3, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s2, 2
+; GFX10-SAFE-SDAG-NEXT: s_or_b32 s3, s3, s6
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s3, 0
+; GFX10-SAFE-SDAG-NEXT: s_addc_u32 s2, s2, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 31
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX10-SAFE-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 s1, -1, 0
+; GFX10-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX10-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
-; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX10-SAFE-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000
; GFX10-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
-; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX10-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
+; GFX10-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX10-SAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-SAFE-SDAG-NEXT: s_endpgm
;
; GFX10-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX10-SAFE-GISEL: ; %bb.0:
-; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; GFX10-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
+; GFX10-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX10-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX10-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_sub_i32 s3, 1, s0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s8, s1, 0x1000
+; GFX10-SAFE-GISEL-NEXT: s_max_i32 s3, s3, 0
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s6, s0, 12
+; GFX10-SAFE-GISEL-NEXT: s_min_i32 s3, s3, 13
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s3
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX10-SAFE-GISEL-NEXT: s_lshl_b32 s3, s9, s3
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lg_u32 s3, s8
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s3, s9, s3
+; GFX10-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s1, s3, s1
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; GFX10-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX10-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX10-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX10-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; GFX10-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX10-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; GFX10-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX10-SAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
+; GFX10-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; GFX10-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-SAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-SAFE-GISEL-NEXT: s_endpgm
;
; GFX10-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-SDAG: ; %bb.0:
-; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-UNSAFE-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; GFX10-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-SDAG-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-UNSAFE-SDAG-NEXT: s_endpgm
;
; GFX10-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX10-UNSAFE-GISEL: ; %bb.0:
-; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-UNSAFE-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX10-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
+; GFX10-UNSAFE-GISEL-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX10-UNSAFE-GISEL-NEXT: s_endpgm
;
; GFX11-SAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-SDAG: ; %bb.0:
-; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SAFE-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s3, 0x1ff
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s4, s2
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s4, s5, 0xffe
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s0, s7, 0x1ff
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s1, s7, 8
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s0, s0, s6
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s1, s1, 0xffe
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s0, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
-; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s2, s3, 0xb0014
-; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s5, 0x3f1, s2
-; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s2, 0xfc10
-; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s5, 0, 13
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s7, s2, 12
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-SAFE-SDAG-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX11-SAFE-SDAG-NEXT: s_sub_i32 s2, 0x3f1, s0
+; GFX11-SAFE-SDAG-NEXT: s_addk_i32 s0, 0xfc10
+; GFX11-SAFE-SDAG-NEXT: v_med3_i32 v1, s2, 0, 13
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SAFE-SDAG-NEXT: s_lshl_b32 s6, s0, 12
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s6, v1
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s4, s4, s5
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s1, s1, s2
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s4, 0x1000
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s7, s4, s7
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s6, s5, s6
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s1, 0x1000
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s1, s6
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s3, s2, s3
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s6
-; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s5, v0
+; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e64 v0, v1, s3
+; GFX11-SAFE-SDAG-NEXT: v_cmp_ne_u32_e32 vcc_lo, s2, v0
; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s5, v0
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s5, s6, s5
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 1
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, s7
+; GFX11-SAFE-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s2, s3, s2
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 1
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, s6
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s6, s5, 7
-; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s6, 5
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s7, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s6, 3
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s3, s2, 7
+; GFX11-SAFE-SDAG-NEXT: s_cmp_gt_i32 s3, 5
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s6, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s5, s5, 2
-; GFX11-SAFE-SDAG-NEXT: s_or_b32 s6, s6, s7
-; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s6, 0
-; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s2, 31
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s5, s5, 0x7c00
-; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s4, 0
-; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s4, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s2, 0x40f
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
+; GFX11-SAFE-SDAG-NEXT: s_cmp_eq_u32 s3, 3
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s3, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s2, 2
+; GFX11-SAFE-SDAG-NEXT: s_or_b32 s3, s3, s6
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s3, 0
+; GFX11-SAFE-SDAG-NEXT: s_addc_u32 s2, s2, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lt_i32 s0, 31
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s2, s2, 0x7c00
+; GFX11-SAFE-SDAG-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 s1, -1, 0
+; GFX11-SAFE-SDAG-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
; GFX11-SAFE-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
-; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s2, s3, 16
-; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-SDAG-NEXT: s_and_b32 s2, s2, 0x8000
+; GFX11-SAFE-SDAG-NEXT: s_lshr_b32 s0, s7, 16
+; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SAFE-SDAG-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-SAFE-SDAG-NEXT: v_lshlrev_b32_e32 v0, 9, v0
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, 0x7c00, v0
-; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s5, v0, vcc_lo
+; GFX11-SAFE-SDAG-NEXT: v_cndmask_b32_e32 v0, s2, v0, vcc_lo
; GFX11-SAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s2, v0
-; GFX11-SAFE-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-SDAG-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX11-SAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SAFE-SDAG-NEXT: s_nop 0
; GFX11-SAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SAFE-SDAG-NEXT: s_endpgm
;
; GFX11-SAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-SAFE-GISEL: ; %bb.0:
-; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SAFE-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
-; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s5, s3, 8
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s4, 0xfc10
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s2, s7, 0x1ff
+; GFX11-SAFE-GISEL-NEXT: s_bfe_u32 s0, s7, 0xb0014
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 8
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s6
+; GFX11-SAFE-GISEL-NEXT: s_addk_i32 s0, 0xfc10
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0xffe
; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s5, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s6, 1, s4
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s2, 0x1000
-; GFX11-SAFE-GISEL-NEXT: s_max_i32 s6, s6, 0
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s7, s4, 12
-; GFX11-SAFE-GISEL-NEXT: s_min_i32 s6, s6, 13
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s5, s5, 9
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, s7
-; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s6, s8
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s2
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_sub_i32 s3, 1, s0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s8, s1, 0x1000
+; GFX11-SAFE-GISEL-NEXT: s_max_i32 s3, s3, 0
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s6, s0, 12
+; GFX11-SAFE-GISEL-NEXT: s_min_i32 s3, s3, 13
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s2, s2, 9
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s9, s8, s3
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s1, s1, s6
+; GFX11-SAFE-GISEL-NEXT: s_lshl_b32 s3, s9, s3
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s2, 0x7c00
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lg_u32 s3, s8
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s9, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s4, 1
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s6, s2
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s2, 7
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s2, s2, 2
-; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s6, 3
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s7, 1, 0
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s6, 5
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s3, s9, s3
+; GFX11-SAFE-GISEL-NEXT: s_cmp_lt_i32 s0, 1
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s1, s3, s1
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s1, 7
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s1, 2
+; GFX11-SAFE-GISEL-NEXT: s_cmp_eq_u32 s3, 3
; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s3, 5
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s3, 1, 0
+; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s3, s6, s3
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 1
+; GFX11-SAFE-GISEL-NEXT: s_add_i32 s1, s1, s3
+; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s0, 30
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s1, 0x7c00, s1
+; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s0, 0x40f
+; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s0, s2, s1
+; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s1, s7, 16
+; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SAFE-GISEL-NEXT: s_and_b32 s1, s1, 0x8000
; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s6, s7, s6
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s6, s6, 1
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_add_i32 s2, s2, s6
-; GFX11-SAFE-GISEL-NEXT: s_cmp_gt_i32 s4, 30
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
-; GFX11-SAFE-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; GFX11-SAFE-GISEL-NEXT: s_cselect_b32 s2, s5, s2
-; GFX11-SAFE-GISEL-NEXT: s_lshr_b32 s3, s3, 16
-; GFX11-SAFE-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-SAFE-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
-; GFX11-SAFE-GISEL-NEXT: s_or_b32 s2, s3, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-SAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-SAFE-GISEL-NEXT: s_or_b32 s0, s1, s0
+; GFX11-SAFE-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-SAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-SAFE-GISEL-NEXT: s_nop 0
; GFX11-SAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SAFE-GISEL-NEXT: s_endpgm
;
; GFX11-UNSAFE-SDAG-LABEL: fptrunc_f64_to_f16:
; GFX11-UNSAFE-SDAG: ; %bb.0:
-; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-UNSAFE-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-UNSAFE-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-UNSAFE-SDAG-NEXT: s_mov_b32 s6, -1
; GFX11-UNSAFE-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-UNSAFE-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-SDAG-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-UNSAFE-SDAG-NEXT: s_nop 0
; GFX11-UNSAFE-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-UNSAFE-SDAG-NEXT: s_endpgm
;
; GFX11-UNSAFE-GISEL-LABEL: fptrunc_f64_to_f16:
; GFX11-UNSAFE-GISEL: ; %bb.0:
-; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-UNSAFE-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-UNSAFE-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
-; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[6:7]
+; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-UNSAFE-GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-UNSAFE-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-UNSAFE-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-UNSAFE-GISEL-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-UNSAFE-GISEL-NEXT: s_nop 0
; GFX11-UNSAFE-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-UNSAFE-GISEL-NEXT: s_endpgm
@@ -595,79 +593,79 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(ptr addrspace(1) %out, <2 x do
; VI-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s11, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s10, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s10, -1
+; VI-GISEL-NEXT: s_mov_b32 s11, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v2f64_to_v2f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v2f64_to_v2f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
-; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -696,87 +694,89 @@ define amdgpu_kernel void @fptrunc_v3f64_to_v3f32(ptr addrspace(1) %out, <3 x do
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s11, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s10, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
-; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[8:11], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s14, -1
+; VI-GISEL-NEXT: s_mov_b32 s15, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[12:15], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v3f64_to_v3f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x2
+; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x54
; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v3f64_to_v3f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x2
+; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x54
; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x44
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[2:3]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
-; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v3f64_to_v3f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
-; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -803,91 +803,91 @@ define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(ptr addrspace(1) %out, <4 x do
; VI-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s15, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s14, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s14, -1
+; VI-GISEL-NEXT: s_mov_b32 s15, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v4f64_to_v4f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v4f64_to_v4f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v3, s[10:11]
-; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -919,9 +919,9 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-SDAG-NEXT: s_mov_b32 s3, 0xf000
-; VI-SDAG-NEXT: s_mov_b32 s2, -1
+; VI-SDAG-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x24
+; VI-SDAG-NEXT: s_mov_b32 s23, 0xf000
+; VI-SDAG-NEXT: s_mov_b32 s22, -1
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -931,16 +931,16 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
+; VI-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; VI-GISEL: ; %bb.0:
; VI-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-GISEL-NEXT: s_mov_b32 s2, -1
-; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
+; VI-GISEL-NEXT: s_load_dwordx2 s[20:21], s[0:1], 0x24
+; VI-GISEL-NEXT: s_mov_b32 s22, -1
+; VI-GISEL-NEXT: s_mov_b32 s23, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -950,17 +950,13 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[20:23], 0
+; VI-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[20:23], 0 offset:16
; VI-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
; GFX10-SDAG-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-SDAG-NEXT: s_mov_b32 s2, -1
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -970,17 +966,17 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX10-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX10-SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX10-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -990,17 +986,17 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; GFX10-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX10-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-GISEL-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: fptrunc_v8f64_to_v8f32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-SDAG-NEXT: s_mov_b32 s2, -1
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
@@ -1010,20 +1006,20 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v2, s[8:9]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-SDAG-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-SDAG-NEXT: s_mov_b32 s6, -1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-SDAG-NEXT: buffer_store_b128 v[4:7], off, s[4:7], 0 offset:16
+; GFX11-SDAG-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: fptrunc_v8f64_to_v8f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b512 s[4:19], s[0:1], 0x64
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_mov_b32 s2, -1
-; GFX11-GISEL-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
@@ -1033,9 +1029,13 @@ define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(ptr addrspace(1) %out, <8 x do
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v5, s[14:15]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v6, s[16:17]
; GFX11-GISEL-NEXT: v_cvt_f32_f64_e32 v7, s[18:19]
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_mov_b32 s6, -1
+; GFX11-GISEL-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
-; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-GISEL-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-GISEL-NEXT: buffer_store_b128 v[4:7], off, s[4:7], 0 offset:16
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 0d59021b69019..c7e284d095c5d 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -93,12 +93,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -160,12 +160,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -185,12 +185,12 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
@@ -277,12 +277,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -335,12 +335,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -357,12 +357,12 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -446,12 +446,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 8
+; VI-NEXT: s_add_u32 s0, s2, 8
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_ushort v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -504,12 +504,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f16_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -526,12 +526,12 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: global_load_u16 v2, v0, s[2:3] offset:8
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f16_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -629,12 +629,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -720,12 +720,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
; GFX11-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
@@ -757,12 +757,12 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v4, null, v2, v2, v1
; GFX1150-NEXT: v_div_scale_f32 v3, vcc_lo, v1, v2, v1
@@ -853,12 +853,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -911,12 +911,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -933,12 +933,12 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1014,12 +1014,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -1072,12 +1072,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1094,12 +1094,12 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f32_e32 v3, v2
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1218,12 +1218,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-LABEL: frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1303,12 +1303,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v12, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v12, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v12, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v12, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -1338,12 +1338,12 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v12, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v12, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v12, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
@@ -1462,12 +1462,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; VI-LABEL: fast_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1536,12 +1536,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v10, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1567,12 +1567,12 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v10, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1687,12 +1687,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; VI-LABEL: unsafe_frem_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -1761,12 +1761,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v10, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -1792,12 +1792,12 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v10, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v10, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[0:1]
+; GFX1150-NEXT: global_load_b64 v[2:3], v10, s[2:3]
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1956,12 +1956,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 16
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_load_dword v4, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -2053,12 +2053,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -2091,12 +2091,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v0, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX1150-NEXT: global_load_b32 v2, v0, s[2:3] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(1)
; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GFX1150-NEXT: s_waitcnt vmcnt(0)
@@ -2346,11 +2346,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
@@ -2493,12 +2493,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -2553,12 +2553,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(1)
; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GFX1150-NEXT: s_waitcnt vmcnt(0)
@@ -2728,11 +2728,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -2864,12 +2864,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@@ -2922,12 +2922,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v4, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
-; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[2:3] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v6, null, v3, v3, v1
; GFX1150-NEXT: v_div_scale_f32 v5, vcc_lo, v1, v3, v1
@@ -3152,11 +3152,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s0, 64
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 64
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -3378,12 +3378,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v8, s[6:7]
-; GFX11-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
+; GFX11-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:64
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
; GFX11-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
@@ -3478,12 +3478,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v8, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b128 v[0:3], v8, s[6:7]
-; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[0:1] offset:64
+; GFX1150-NEXT: global_load_b128 v[4:7], v8, s[2:3] offset:64
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f32 v10, null, v7, v7, v3
; GFX1150-NEXT: v_div_scale_f32 v9, vcc_lo, v3, v7, v3
@@ -3731,11 +3731,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: frem_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_add_u32 s0, s0, 64
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s2, 64
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v4, s0
@@ -3859,12 +3859,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v16, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b128 v[0:3], v16, s[6:7]
-; GFX11-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
+; GFX11-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:64
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -3913,12 +3913,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150: ; %bb.0:
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX1150-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX1150-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX1150-NEXT: v_mov_b32_e32 v16, 0
; GFX1150-NEXT: s_waitcnt lgkmcnt(0)
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_b128 v[0:3], v16, s[6:7]
-; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[0:1] offset:64
+; GFX1150-NEXT: global_load_b128 v[4:7], v16, s[2:3] offset:64
; GFX1150-NEXT: s_waitcnt vmcnt(0)
; GFX1150-NEXT: v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll
index fecf303d57691..2e36b5323d4bd 100644
--- a/llvm/test/CodeGen/AMDGPU/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshl.ll
@@ -210,22 +210,22 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
-; VI-NEXT: s_not_b32 s3, s3
-; VI-NEXT: s_lshr_b32 s7, s5, 1
+; VI-NEXT: s_not_b32 s1, s3
+; VI-NEXT: s_lshr_b32 s0, s5, 1
; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v1, s7, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_not_b32 s2, s2
+; VI-NEXT: s_not_b32 s1, s2
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: s_lshr_b32 s3, s4, 1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_alignbit_b32 v0, s3, v0, v2
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_lshr_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -293,18 +293,18 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s5, s7, 1
; GFX11-NEXT: v_alignbit_b32 v3, s4, s6, 1
-; GFX11-NEXT: s_lshr_b32 s5, s5, 1
-; GFX11-NEXT: s_not_b32 s3, s3
-; GFX11-NEXT: s_lshr_b32 s4, s4, 1
+; GFX11-NEXT: s_lshr_b32 s0, s5, 1
+; GFX11-NEXT: s_not_b32 s1, s3
+; GFX11-NEXT: s_lshr_b32 s3, s4, 1
; GFX11-NEXT: s_not_b32 s2, s2
-; GFX11-NEXT: v_alignbit_b32 v1, s5, v0, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s4, v3, s2
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_alignbit_b32 v1, s0, v0, s1
+; GFX11-NEXT: v_alignbit_b32 v0, s3, v3, s2
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[8:9]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -332,14 +332,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-LABEL: fshl_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -386,12 +386,12 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 23
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 25
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -441,34 +441,34 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
-; VI-NEXT: s_not_b32 s3, s15
-; VI-NEXT: s_lshr_b32 s2, s7, 1
+; VI-NEXT: s_not_b32 s1, s15
+; VI-NEXT: s_lshr_b32 s0, s7, 1
; VI-NEXT: v_alignbit_b32 v0, s7, v0, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v3, s2, v0, v1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v3, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s10
-; VI-NEXT: s_not_b32 s3, s14
+; VI-NEXT: s_not_b32 s1, s14
; VI-NEXT: v_alignbit_b32 v0, s6, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s6, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v2, s2, v0, v1
+; VI-NEXT: s_lshr_b32 s0, s6, 1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v2, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s9
-; VI-NEXT: s_not_b32 s3, s13
+; VI-NEXT: s_not_b32 s1, s13
; VI-NEXT: v_alignbit_b32 v0, s5, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s5, 1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
+; VI-NEXT: s_lshr_b32 s0, s5, 1
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_alignbit_b32 v1, s0, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_not_b32 s3, s12
+; VI-NEXT: s_not_b32 s1, s12
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: s_lshr_b32 s2, s4, 1
-; VI-NEXT: v_mov_b32_e32 v4, s3
-; VI-NEXT: v_alignbit_b32 v0, s2, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_lshr_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_alignbit_b32 v0, s0, v0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -534,29 +534,29 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
;
; GFX10-LABEL: fshl_v4i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX10-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v4, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v0, s7, s11, 1
; GFX10-NEXT: v_alignbit_b32 v1, s6, s10, 1
; GFX10-NEXT: v_alignbit_b32 v5, s5, s9, 1
; GFX10-NEXT: v_alignbit_b32 v6, s4, s8, 1
-; GFX10-NEXT: s_lshr_b32 s2, s7, 1
-; GFX10-NEXT: s_not_b32 s3, s15
+; GFX10-NEXT: s_lshr_b32 s0, s7, 1
+; GFX10-NEXT: s_not_b32 s1, s15
; GFX10-NEXT: s_lshr_b32 s6, s6, 1
; GFX10-NEXT: s_not_b32 s7, s14
; GFX10-NEXT: s_lshr_b32 s5, s5, 1
; GFX10-NEXT: s_not_b32 s9, s13
; GFX10-NEXT: s_lshr_b32 s4, s4, 1
; GFX10-NEXT: s_not_b32 s8, s12
-; GFX10-NEXT: v_alignbit_b32 v3, s2, v0, s3
+; GFX10-NEXT: v_alignbit_b32 v3, s0, v0, s1
; GFX10-NEXT: v_alignbit_b32 v2, s6, v1, s7
; GFX10-NEXT: v_alignbit_b32 v1, s5, v5, s9
; GFX10-NEXT: v_alignbit_b32 v0, s4, v6, s8
-; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fshl_v4i32:
@@ -564,26 +564,26 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v0, s7, s11, 1
; GFX11-NEXT: v_alignbit_b32 v1, s6, s10, 1
; GFX11-NEXT: v_alignbit_b32 v5, s5, s9, 1
; GFX11-NEXT: v_alignbit_b32 v6, s4, s8, 1
-; GFX11-NEXT: s_lshr_b32 s2, s7, 1
-; GFX11-NEXT: s_not_b32 s3, s15
+; GFX11-NEXT: s_lshr_b32 s0, s7, 1
+; GFX11-NEXT: s_not_b32 s1, s15
; GFX11-NEXT: s_lshr_b32 s6, s6, 1
; GFX11-NEXT: s_not_b32 s7, s14
; GFX11-NEXT: s_lshr_b32 s5, s5, 1
; GFX11-NEXT: s_not_b32 s9, s13
; GFX11-NEXT: s_lshr_b32 s4, s4, 1
; GFX11-NEXT: s_not_b32 s8, s12
-; GFX11-NEXT: v_alignbit_b32 v3, s2, v0, s3
+; GFX11-NEXT: v_alignbit_b32 v3, s0, v0, s1
; GFX11-NEXT: v_alignbit_b32 v2, s6, v1, s7
; GFX11-NEXT: v_alignbit_b32 v1, s5, v5, s9
; GFX11-NEXT: v_alignbit_b32 v0, s4, v6, s8
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -615,7 +615,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-LABEL: fshl_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s10
@@ -624,17 +624,17 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_alignbit_b32 v2, s6, v1, 23
; VI-NEXT: v_alignbit_b32 v1, s5, v4, 25
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 31
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshl_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
@@ -644,7 +644,7 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 25
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 31
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshl_v4i32_imm:
@@ -683,14 +683,14 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 31
; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 23
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 25
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 31
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index a5ea1ee92a048..860fe74408871 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -196,7 +196,7 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -204,8 +204,8 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_alignbit_b32 v0, s4, v2, v0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s8
+; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -259,14 +259,14 @@ define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, v0
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, v2
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[8:9]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -294,14 +294,14 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; VI-LABEL: fshr_v2i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9
; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -348,12 +348,12 @@ define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s7, 9
; GFX11-NEXT: v_alignbit_b32 v0, s4, s6, 7
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -391,7 +391,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s15
@@ -405,8 +405,8 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -414,8 +414,8 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x54
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s15
@@ -429,7 +429,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, v5
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32:
@@ -474,7 +474,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x54
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v6, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14
@@ -485,7 +485,7 @@ define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, v4
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, v5
-; GFX11-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v6, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -517,7 +517,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-LABEL: fshr_v4i32_imm:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: v_mov_b32_e32 v1, s10
@@ -526,17 +526,17 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; VI-NEXT: v_alignbit_b32 v2, s6, v1, 9
; VI-NEXT: v_alignbit_b32 v1, s5, v4, 7
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fshr_v4i32_imm:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s11
; GFX9-NEXT: v_mov_b32_e32 v1, s10
@@ -546,7 +546,7 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX9-NEXT: v_alignbit_b32 v1, s5, v0, 7
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_alignbit_b32 v0, s4, v0, 1
-; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX9-NEXT: s_endpgm
;
; R600-LABEL: fshr_v4i32_imm:
@@ -583,14 +583,14 @@ define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s11, 1
; GFX11-NEXT: v_alignbit_b32 v2, s6, s10, 9
; GFX11-NEXT: v_alignbit_b32 v1, s5, s9, 7
; GFX11-NEXT: v_alignbit_b32 v0, s4, s8, 1
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
index f72d4e0e03633..6de84a6ddea78 100644
--- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -58,24 +58,24 @@ define amdgpu_kernel void @fsub_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_sub_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -113,38 +113,38 @@ define amdgpu_kernel void @fsub_f16_imm_a(
;
; GFX89-LABEL: fsub_f16_imm_a:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_sub_f16_e32 v0, 1.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_sub_f16_e32 v0, 1.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,38 +180,38 @@ define amdgpu_kernel void @fsub_f16_imm_b(
;
; GFX89-LABEL: fsub_f16_imm_b:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0 glc
; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: v_add_f16_e32 v0, -2.0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_add_f16_e32 v0, -2.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,23 +309,23 @@ define amdgpu_kernel void @fsub_v2f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -369,60 +369,60 @@ define amdgpu_kernel void @fsub_v2f16_imm_a(
;
; VI-LABEL: fsub_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0x4000
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_sub_f16_e32 v0, 1.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0x40003c00
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0x40003c00
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, s0 neg_lo:[1,0] neg_hi:[1,0]
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s4 neg_lo:[1,0] neg_hi:[1,0]
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, 0x40003c00, v0 neg_lo:[0,1] neg_hi:[0,1]
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -464,60 +464,60 @@ define amdgpu_kernel void @fsub_v2f16_imm_b(
;
; VI-LABEL: fsub_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbc00
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_add_f16_e32 v0, -2.0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fsub_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s0, 0xbc00c000
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s4, 0xbc00c000
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_add_f16 v0, v0, s0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: v_pk_add_f16 v0, v0, s4
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: fsub_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v0, 0xbc00c000, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
index 1853aa9303095..6d868e8427fce 100644
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -4,14 +4,14 @@
define amdgpu_kernel void @divergent_or3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v0, v1, v0, v2
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -31,17 +31,17 @@ bb:
define amdgpu_kernel void @divergent_or3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_or3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or3_b32 v1, v3, v1, v5
; GCN-NEXT: v_or3_b32 v0, v2, v0, v4
; GCN-NEXT: v_not_b32_e32 v1, v1
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -61,15 +61,15 @@ bb:
define amdgpu_kernel void @divergent_and3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, v1, v0
; GCN-NEXT: v_and_b32_e32 v0, v0, v2
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -89,11 +89,11 @@ bb:
define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_and3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_and_b32_e32 v1, v3, v1
; GCN-NEXT: v_and_b32_e32 v0, v2, v0
@@ -102,7 +102,7 @@ define amdgpu_kernel void @divergent_and3_b64(ptr addrspace(1) %arg) {
; GCN-NEXT: v_and_b32_e32 v0, v0, v4
; GCN-NEXT: v_not_b32_e32 v1, v1
; GCN-NEXT: v_not_b32_e32 v0, v0
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -122,14 +122,14 @@ bb:
define amdgpu_kernel void @divergent_xor3_b32(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b32:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v3, 4, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[0:1]
+; GCN-NEXT: global_load_dwordx3 v[0:2], v3, s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v0, v1, v0
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v2
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -149,18 +149,18 @@ bb:
define amdgpu_kernel void @divergent_xor3_b64(ptr addrspace(1) %arg) {
; GCN-LABEL: divergent_xor3_b64:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v6, 5, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[0:1]
-; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[0:1] offset:16
+; GCN-NEXT: global_load_dwordx4 v[0:3], v6, s[2:3]
+; GCN-NEXT: global_load_dwordx2 v[4:5], v6, s[2:3] offset:16
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_xor_b32_e32 v1, v3, v1
; GCN-NEXT: v_xor_b32_e32 v0, v2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5
; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4
-; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1]
+; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
; GCN-NEXT: s_endpgm
bb:
%i = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
index 0612383c3f90b..98bb4050e4b30 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp-wrong-subtarget.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx803 -d - | FileCheck -check-prefix=DISASSEMBLY-VI %s
@@ -6,7 +7,7 @@
; FIXME: This will still fail for gfx6/7 and gfx10 subtargets.
; DISASSEMBLY-VI: .long 0xdd348000 // {{[0-9A-Z]+}}: DD348000
-; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc // {{[0-9A-Z]+}}: 00000100
+; DISASSEMBLY-VI-NEXT: v_cndmask_b32_e32 v2, v0, v0, vcc // {{[0-9A-Z]+}}: 00040100
define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #0 {
; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
@@ -18,13 +19,13 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr
; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s0
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
+; GCN-NEXT: global_atomic_add_f32 v0, v1, s[4:5]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: .LBB0_2:
diff --git a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
index b8ecbae3d3114..d3dc660e99bed 100644
--- a/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-i16-load-store.ll
@@ -135,10 +135,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX908-NEXT: v_mov_b32_e32 v2, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX908-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v0, s0
-; GFX908-NEXT: v_mov_b32_e32 v1, s1
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: v_mov_b32_e32 v1, s5
; GFX908-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX908-NEXT: s_endpgm
;
@@ -147,9 +147,9 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX90A-NEXT: s_endpgm
;
@@ -158,10 +158,10 @@ define amdgpu_kernel void @half4(ptr addrspace(1) nocapture readonly %0, ptr add
; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX1030-NEXT: v_mov_b32_e32 v2, 0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_mov_b32_e32 v0, s0
-; GFX1030-NEXT: v_mov_b32_e32 v1, s1
+; GFX1030-NEXT: v_mov_b32_e32 v0, s4
+; GFX1030-NEXT: v_mov_b32_e32 v1, s5
; GFX1030-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1030-NEXT: s_endpgm
%gep0 = getelementptr half, ptr addrspace(1) %0, i64 0
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
index f709eae990bda..41327f92d90df 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll
@@ -13,15 +13,15 @@
define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: test_move_load_address_to_vgpr:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dword v0, v1, s[0:1] glc
+; GCN-NEXT: global_load_dword v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: v_add_u32_e32 v2, 0xffffff00, v0
; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
-; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
+; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
; GCN-NEXT: .LBB0_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -54,21 +54,21 @@ bb3: ; preds = %bb3, %bb
define amdgpu_kernel void @test_move_load_address_to_vgpr_d16_hi(ptr addrspace(1) nocapture %arg) {
; GCN-LABEL: test_move_load_address_to_vgpr_d16_hi:
; GCN: ; %bb.0: ; %bb
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_movk_i32 s0, 0x100
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v0, v1, s[0:1] glc
+; GCN-NEXT: global_load_ushort v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_movk_i32 s1, 0x100
+; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: .LBB1_1: ; %bb3
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_lshlrev_b64 v[3:4], 1, v[0:1]
-; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s0, v3
+; GCN-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3
; GCN-NEXT: v_addc_co_u32_e32 v4, vcc, v2, v4, vcc
; GCN-NEXT: global_load_short_d16_hi v0, v[3:4], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s1, v0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; GCN-NEXT: s_cbranch_vccz .LBB1_1
; GCN-NEXT: ; %bb.2: ; %bb2
; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
index 4d585cf56b9a1..7653cd02c8f28 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll
@@ -19,13 +19,13 @@ define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_add_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -112,14 +112,14 @@ define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_add_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -394,13 +394,13 @@ define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_add_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -616,13 +616,13 @@ define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_and_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -846,13 +846,13 @@ define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_and_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1068,13 +1068,13 @@ define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_sub_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1298,13 +1298,13 @@ define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_sub_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1520,13 +1520,13 @@ define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_max_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -1736,13 +1736,13 @@ define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_max_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i32:
@@ -1940,13 +1940,13 @@ define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_umax_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32_offset:
@@ -2152,13 +2152,13 @@ define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_umax_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i32:
@@ -2356,13 +2356,13 @@ define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_min_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32_offset:
@@ -2568,13 +2568,13 @@ define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_min_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i32:
@@ -2772,13 +2772,13 @@ define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_umin_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i32_offset:
@@ -2984,13 +2984,13 @@ define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_umin_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i32:
@@ -3190,13 +3190,13 @@ define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_or_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3420,13 +3420,13 @@ define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_or_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3642,13 +3642,13 @@ define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_xchg_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3686,13 +3686,13 @@ define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %
;
; VI-LABEL: atomic_xchg_f32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -3916,13 +3916,13 @@ define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_xchg_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -4632,13 +4632,13 @@ define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_xor_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -4862,13 +4862,13 @@ define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: atomic_xor_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -5087,31 +5087,31 @@ define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %in, i64 4
@@ -5141,31 +5141,31 @@ define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_i32_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %in, i64 -128
@@ -5193,31 +5193,31 @@ define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_f32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 16
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f32_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr float, ptr addrspace(1) %in, i64 4
@@ -5245,29 +5245,29 @@ define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1
;
; VI-LABEL: atomic_load_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
@@ -5298,14 +5298,12 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_i32_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -5313,9 +5311,11 @@ define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_addr64_offset:
@@ -5363,22 +5363,22 @@ define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i32_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i32_addr64:
@@ -5425,14 +5425,12 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_f32_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 16
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -5440,9 +5438,11 @@ define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f32_addr64_offset:
@@ -5796,29 +5796,29 @@ define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrs
;
; VI-LABEL: atomic_load_i8_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i8_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %in, i64 16
@@ -5848,31 +5848,31 @@ define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr ad
;
; VI-LABEL: atomic_load_i8_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ubyte v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i8_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ubyte v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i8, ptr addrspace(1) %in, i64 -512
@@ -5977,29 +5977,29 @@ define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i16_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i16_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %in, i64 8
@@ -6029,31 +6029,31 @@ define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_i16_negoffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i16_negoffset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%gep = getelementptr i16, ptr addrspace(1) %in, i64 -256
@@ -6307,13 +6307,13 @@ define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_inc_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6400,14 +6400,14 @@ define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_inc_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6681,13 +6681,13 @@ define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in)
;
; VI-LABEL: atomic_dec_i32_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -6774,14 +6774,14 @@ define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in
;
; VI-LABEL: atomic_dec_i32_soffset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_mov_b32 s5, 0x8ca0
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s0, 0x8ca0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], s0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
@@ -7058,29 +7058,29 @@ define amdgpu_kernel void @atomic_load_f16_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_f16_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f16_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr half, ptr addrspace(1) %in, i64 8
%val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
@@ -7109,31 +7109,31 @@ define amdgpu_kernel void @atomic_load_f16_negoffset(ptr addrspace(1) %in, ptr a
;
; VI-LABEL: atomic_load_f16_negoffset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f16_negoffset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr half, ptr addrspace(1) %in, i64 -256
%val = load atomic half, ptr addrspace(1) %gep seq_cst, align 2
@@ -7160,29 +7160,29 @@ define amdgpu_kernel void @atomic_load_bf16_offset(ptr addrspace(1) %in, ptr add
;
; VI-LABEL: atomic_load_bf16_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: s_mov_b32 s2, s6
-; VI-NEXT: s_mov_b32 s3, s7
-; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:16 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_bf16_offset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:16 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %in, i64 8
%val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
@@ -7211,31 +7211,31 @@ define amdgpu_kernel void @atomic_load_bf16_negoffset(ptr addrspace(1) %in, ptr
;
; VI-LABEL: atomic_load_bf16_negoffset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xfffffe00
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_bf16_negoffset:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[4:5] offset:-512 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_short v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%gep = getelementptr bfloat, ptr addrspace(1) %in, i64 -256
%val = load atomic bfloat, ptr addrspace(1) %gep seq_cst, align 2
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 3050da034d236..b8031c6bbcc98 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -4753,26 +4753,26 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB92_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_i32_e32 v2, s4, v3
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4782,8 +4782,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_cbranch_execnz .LBB92_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4963,24 +4963,24 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB94_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_i32_e32 v2, s4, v3
+; VI-NEXT: v_max_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -4990,8 +4990,8 @@ define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB94_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -6006,26 +6006,26 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB106_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_u32_e32 v2, s4, v3
+; VI-NEXT: v_max_u32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6035,8 +6035,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %o
; VI-NEXT: s_cbranch_execnz .LBB106_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -6121,24 +6121,24 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB107_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_max_u32_e32 v2, s4, v3
+; VI-NEXT: v_max_u32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6148,8 +6148,8 @@ define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB107_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -7997,26 +7997,26 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i32_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x10
-; VI-NEXT: s_add_u32 s6, s6, 16
-; VI-NEXT: s_addc_u32 s7, s7, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x10
+; VI-NEXT: s_add_u32 s4, s4, 16
+; VI-NEXT: s_addc_u32 s5, s5, 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: .LBB129_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_min_i32_e32 v2, s4, v3
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8026,8 +8026,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %ou
; VI-NEXT: s_cbranch_execnz .LBB129_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -8194,24 +8194,24 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i32_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s5, 31
-; VI-NEXT: s_mov_b32 s6, s5
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s1, s3, 31
+; VI-NEXT: s_mov_b32 s0, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 2
+; VI-NEXT: s_add_u32 s4, s4, s0
+; VI-NEXT: s_addc_u32 s5, s5, s1
+; VI-NEXT: s_load_dword s3, s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: .LBB131_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v3, v2
-; VI-NEXT: v_min_i32_e32 v2, s4, v3
+; VI-NEXT: v_min_i32_e32 v2, s2, v3
; VI-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -8221,8 +8221,8 @@ define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB131_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index f5dbaaff9cf88..a6c8f661e920a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -20,36 +20,36 @@ define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_add_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -115,15 +115,15 @@ define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_add_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -155,10 +155,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_add_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -191,11 +191,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1] offset:32
@@ -233,56 +233,56 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_add_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -312,38 +312,38 @@ define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_add_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -408,15 +408,15 @@ define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_add_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -447,10 +447,10 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_add_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -481,11 +481,11 @@ define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v2, v[0:1], s[0:1]
@@ -522,54 +522,54 @@ define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_add_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_add_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_add_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -596,36 +596,36 @@ define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_and_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -691,15 +691,15 @@ define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_and_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -731,10 +731,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_and_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -767,11 +767,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1] offset:32
@@ -809,56 +809,56 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_and_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -888,38 +888,38 @@ define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_and_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -984,15 +984,15 @@ define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_and_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1023,10 +1023,10 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_and_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -1057,11 +1057,11 @@ define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v2, v[0:1], s[0:1]
@@ -1098,54 +1098,54 @@ define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_and_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_and_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_and_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_and_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1172,36 +1172,36 @@ define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_sub_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -1267,15 +1267,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_sub_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1307,10 +1307,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_sub_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -1343,11 +1343,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1] offset:32
@@ -1385,56 +1385,56 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_sub_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1464,38 +1464,38 @@ define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_sub_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -1560,15 +1560,15 @@ define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_sub_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1599,10 +1599,10 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_sub_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -1633,11 +1633,11 @@ define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v2, v[0:1], s[0:1]
@@ -1674,54 +1674,54 @@ define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_sub_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_sub_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_sub_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_sub_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1746,32 +1746,32 @@ define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_max_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -1834,15 +1834,15 @@ define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_max_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1872,10 +1872,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_max_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -1904,11 +1904,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1] offset:32
@@ -1945,54 +1945,54 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2020,34 +2020,34 @@ define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_max_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2109,15 +2109,15 @@ define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_max_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2146,10 +2146,10 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_max_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2176,11 +2176,11 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v2, v[0:1], s[0:1]
@@ -2216,52 +2216,52 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_max_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2286,32 +2286,32 @@ define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_umax_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2374,15 +2374,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_umax_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2412,10 +2412,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_umax_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -2444,11 +2444,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1] offset:32
@@ -2485,54 +2485,54 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2560,34 +2560,34 @@ define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_umax_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2649,15 +2649,15 @@ define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_umax_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2686,10 +2686,10 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_umax_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2716,11 +2716,11 @@ define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v2, v[0:1], s[0:1]
@@ -2756,52 +2756,52 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umax_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_max_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2826,32 +2826,32 @@ define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_min_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -2914,15 +2914,15 @@ define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_min_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2952,10 +2952,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_min_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -2984,11 +2984,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1] offset:32
@@ -3025,54 +3025,54 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3100,34 +3100,34 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_min_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3189,15 +3189,15 @@ define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_min_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3226,10 +3226,10 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_min_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -3256,11 +3256,11 @@ define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v2, v[0:1], s[0:1]
@@ -3296,52 +3296,52 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_min_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_i64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3366,32 +3366,32 @@ define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_umin_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3454,15 +3454,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_umin_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3492,10 +3492,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_umin_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -3524,11 +3524,11 @@ define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1] offset:32
@@ -3565,54 +3565,54 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umin_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3640,34 +3640,34 @@ define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_umin_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: s_endpgm
@@ -3729,15 +3729,15 @@ define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_umin_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3766,10 +3766,10 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_umin_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -3796,11 +3796,11 @@ define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v2, v[0:1], s[0:1]
@@ -3836,52 +3836,52 @@ define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umin_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umin_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_umin_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_min_u64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SE
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3908,36 +3908,36 @@ define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_or_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4003,15 +4003,15 @@ define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr a
; GFX12-LABEL: atomic_or_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4043,10 +4043,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; VI-LABEL: atomic_or_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -4079,11 +4079,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1] offset:32
@@ -4121,56 +4121,56 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out
;
; VI-LABEL: atomic_or_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4200,38 +4200,38 @@ define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_or_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4296,15 +4296,15 @@ define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspac
; GFX12-LABEL: atomic_or_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4335,10 +4335,10 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_or_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -4369,11 +4369,11 @@ define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v2, v[0:1], s[0:1]
@@ -4410,54 +4410,54 @@ define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: atomic_or_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_or_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_or_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_or_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4484,36 +4484,36 @@ define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in
;
; VI-LABEL: atomic_xchg_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4539,36 +4539,36 @@ define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double
;
; VI-LABEL: atomic_xchg_f64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_f64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_f64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4594,36 +4594,36 @@ define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xchg_pointer_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_pointer_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_pointer_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4689,15 +4689,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_xchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4729,10 +4729,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; VI-LABEL: atomic_xchg_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -4765,11 +4765,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1] offset:32
@@ -4807,56 +4807,56 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_xchg_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4886,38 +4886,38 @@ define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_xchg_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -4982,15 +4982,15 @@ define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrsp
; GFX12-LABEL: atomic_xchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5021,10 +5021,10 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; VI-LABEL: atomic_xchg_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -5055,11 +5055,11 @@ define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v2, v[0:1], s[0:1]
@@ -5096,54 +5096,54 @@ define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xchg_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xchg_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xchg_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_swap_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5170,36 +5170,36 @@ define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_xor_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -5265,15 +5265,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_xor_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5305,10 +5305,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_xor_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -5341,11 +5341,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1] offset:32
@@ -5383,56 +5383,56 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_xor_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5462,38 +5462,38 @@ define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_xor_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -5558,15 +5558,15 @@ define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspa
; GFX12-LABEL: atomic_xor_i64_ret:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5597,10 +5597,10 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; VI-LABEL: atomic_xor_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -5631,11 +5631,11 @@ define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v2, v[0:1], s[0:1]
@@ -5672,54 +5672,54 @@ define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_xor_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_xor_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_xor_i64_ret_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_mov_b32_e32 v0, s8
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_xor_b64 v[0:1], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5786,11 +5786,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5859,11 +5859,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5] offset:72000
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -5897,50 +5897,50 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out,
;
; VI-LABEL: atomic_cmpxchg_i64_ret_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: s_mov_b32 s2, s10
-; VI-NEXT: s_mov_b32 s3, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 glc
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: s_mov_b32 s6, s2
+; VI-NEXT: s_mov_b32 s7, s3
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5974,18 +5974,18 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_cmpxchg_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -5994,16 +5994,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6011,14 +6011,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %ou
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6058,19 +6058,19 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: s_add_u32 s0, s4, s2
-; VI-NEXT: s_addc_u32 s3, s5, s3
-; VI-NEXT: s_add_u32 s2, s0, 32
-; VI-NEXT: s_addc_u32 s3, s3, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
+; VI-NEXT: s_add_u32 s0, s0, 32
+; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6084,17 +6084,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GFX9-NEXT: s_add_u32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_addc_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] offset:32 glc
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
@@ -6104,11 +6104,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1)
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
@@ -6184,11 +6184,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i6
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6221,50 +6221,50 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: atomic_cmpxchg_i64_ret:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_cmpxchg_i64_ret:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpxchg_i64_ret:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[4:5] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6297,16 +6297,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; VI-LABEL: atomic_cmpxchg_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -6315,16 +6315,16 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; GFX9-LABEL: atomic_cmpxchg_i64_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s4
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
@@ -6332,14 +6332,14 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64
;
; GFX12-LABEL: atomic_cmpxchg_i64_addr64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: s_lshl_b64 s[2:3], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s11
+; GFX12-NEXT: v_mov_b32_e32 v2, s10
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[8:9], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_cmpswap_b64 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
@@ -6378,17 +6378,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; VI-LABEL: atomic_cmpxchg_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; VI-NEXT: s_add_u32 s2, s4, s2
-; VI-NEXT: s_addc_u32 s3, s5, s3
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -6402,17 +6402,17 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
-; GFX9-NEXT: s_add_u32 s2, s4, s2
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_addc_u32 s3, s5, s3
-; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: v_mov_b32_e32 v3, s1
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
@@ -6422,11 +6422,11 @@ define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out,
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s1
-; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
@@ -6464,42 +6464,42 @@ define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_add_u32 s0, s4, 32
+; VI-NEXT: s_addc_u32 s1, s5, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6531,42 +6531,42 @@ define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr
;
; VI-LABEL: atomic_load_i64_neg_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 0xffffffe0
-; VI-NEXT: s_addc_u32 s1, s1, -1
+; VI-NEXT: s_add_u32 s0, s4, 0xffffffe0
+; VI-NEXT: s_addc_u32 s1, s5, -1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_neg_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] offset:-32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64_neg_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:-32 th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] offset:-32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6596,40 +6596,40 @@ define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1
;
; VI-LABEL: atomic_load_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_load_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[4:5] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6662,14 +6662,12 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -6677,9 +6675,11 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_addr64_offset:
@@ -6700,17 +6700,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, p
; GFX12-LABEL: atomic_load_i64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6745,22 +6745,22 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
;
; VI-LABEL: atomic_load_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_i64_addr64:
@@ -6781,17 +6781,17 @@ define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addr
; GFX12-LABEL: atomic_load_i64_addr64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6825,14 +6825,12 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
;
; VI-LABEL: atomic_load_f64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s0, s0, s4
-; VI-NEXT: s_addc_u32 s1, s1, s5
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s0, s4, s0
+; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -6840,9 +6838,11 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_load_f64_addr64_offset:
@@ -6863,17 +6863,17 @@ define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, p
; GFX12-LABEL: atomic_load_f64_addr64_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6901,34 +6901,34 @@ define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %ou
;
; VI-LABEL: atomic_store_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s2, 32
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: s_add_u32 s0, s6, 32
+; VI-NEXT: s_addc_u32 s1, s7, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_store_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] offset:32
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6954,32 +6954,32 @@ define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
;
; VI-LABEL: atomic_store_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_store_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_store_i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7008,10 +7008,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
; VI-LABEL: atomic_store_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7040,11 +7040,11 @@ define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32
@@ -7078,10 +7078,10 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
; VI-LABEL: atomic_store_i64_addr64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -7108,11 +7108,11 @@ define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %ou
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
@@ -7145,10 +7145,10 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
; VI-LABEL: atomic_store_f64_addr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s6, s0
; VI-NEXT: s_addc_u32 s1, s7, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7177,11 +7177,11 @@ define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrsp
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] offset:32
@@ -7211,36 +7211,36 @@ define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_inc_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_inc_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_inc_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -7306,15 +7306,15 @@ define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_inc_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7346,10 +7346,10 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_inc_i64_incr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7382,11 +7382,11 @@ define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32
@@ -7416,36 +7416,36 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in)
;
; VI-LABEL: atomic_dec_i64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_dec_i64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[4:5] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_dec_i64_offset:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[4:5] offset:32
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_endpgm
@@ -7511,15 +7511,15 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr
; GFX12-LABEL: atomic_dec_i64_ret_offset:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[0:1] offset:32 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_atomic_dec_u64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7551,10 +7551,10 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; VI-LABEL: atomic_dec_i64_decr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
@@ -7587,11 +7587,11 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GFX12-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[4:5], s[0:1]
; GFX12-NEXT: global_atomic_dec_u64 v2, v[0:1], s[0:1] offset:32
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index cafd35afea6eb..200aa198bb8cf 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -4905,26 +4905,26 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
;
; VI-LABEL: atomic_max_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB88_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -4932,9 +4932,9 @@ define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB88_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -5025,76 +5025,76 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_max_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB89_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB89_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB89_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -5146,24 +5146,24 @@ define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in,
;
; VI-LABEL: atomic_max_i64_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
-; VI-NEXT: s_add_u32 s4, s0, s4
-; VI-NEXT: s_addc_u32 s5, s1, s5
-; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v6, s3
-; VI-NEXT: v_mov_b32_e32 v7, s2
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB90_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -5263,25 +5263,25 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_max_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB91_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -5293,44 +5293,44 @@ define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB91_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_max_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB91_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6367,26 +6367,26 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
;
; VI-LABEL: atomic_umax_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB102_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -6394,9 +6394,9 @@ define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out,
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB102_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -6487,76 +6487,76 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %o
;
; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB103_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB103_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB103_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -6613,25 +6613,25 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_umax_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB104_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -6643,44 +6643,44 @@ define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB104_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_umax_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB104_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8703,26 +8703,26 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
;
; VI-LABEL: atomic_min_i64_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
+; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v6, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: .LBB125_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8730,9 +8730,9 @@ define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB125_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
@@ -8823,76 +8823,76 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %ou
;
; VI-LABEL: atomic_min_i64_ret_addr64_offset:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s0, s0, s6
-; VI-NEXT: s_addc_u32 s1, s1, s7
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x20
-; VI-NEXT: s_add_u32 s0, s0, 32
-; VI-NEXT: s_addc_u32 s1, s1, 0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v4, s5
+; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s2
+; VI-NEXT: s_addc_u32 s3, s5, s3
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
+; VI-NEXT: s_add_u32 s2, s2, 32
+; VI-NEXT: s_addc_u32 s3, s3, 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v5, s4
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v5, s8
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB126_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
-; VI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; VI-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB126_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
-; VI-NEXT: s_or_b64 exec, exec, s[8:9]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: s_or_b64 exec, exec, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x20
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x20
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB126_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
@@ -8942,20 +8942,20 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
;
; VI-LABEL: atomic_min_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s3
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v6, s7
+; VI-NEXT: v_mov_b32_e32 v7, s6
+; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: .LBB127_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -8963,38 +8963,38 @@ define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v1
-; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_mov_b32_e32 v2, v0
-; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; VI-NEXT: s_andn2_b64 exec, exec, s[0:1]
; VI-NEXT: s_cbranch_execnz .LBB127_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_mov_b32_e32 v5, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v4, s7
+; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[4:5] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB127_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_endpgm
@@ -9050,25 +9050,25 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
;
; VI-LABEL: atomic_min_i64_ret_addr64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; VI-NEXT: s_add_u32 s6, s0, s6
-; VI-NEXT: s_addc_u32 s7, s1, s7
-; VI-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; VI-NEXT: s_add_u32 s2, s4, s0
+; VI-NEXT: s_addc_u32 s3, s5, s1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: s_mov_b64 s[0:1], 0
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: v_mov_b32_e32 v5, s4
+; VI-NEXT: v_mov_b32_e32 v4, s9
+; VI-NEXT: v_mov_b32_e32 v5, s8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_mov_b32_e32 v3, s9
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: .LBB128_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v3
; VI-NEXT: v_mov_b32_e32 v8, v2
-; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9]
+; VI-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
; VI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
; VI-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
@@ -9080,44 +9080,44 @@ define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr
; VI-NEXT: s_cbranch_execnz .LBB128_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_min_i64_ret_addr64:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
-; GFX9-NEXT: s_add_u32 s0, s0, s6
-; GFX9-NEXT: s_addc_u32 s1, s1, s7
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b64 s[6:7], 0
-; GFX9-NEXT: v_mov_b32_e32 v2, s5
-; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3
+; GFX9-NEXT: s_add_u32 s0, s4, s0
+; GFX9-NEXT: s_addc_u32 s1, s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-NEXT: v_mov_b32_e32 v3, s8
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s8
-; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v8, v1
; GFX9-NEXT: v_mov_b32_e32 v7, v0
-; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[7:8]
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[7:8]
; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[5:8], s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[7:8]
-; GFX9-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
; GFX9-NEXT: s_cbranch_execnz .LBB128_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 6555ceb3ed338..9d174bece11a1 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -54,95 +54,95 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -157,14 +157,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1164-NEXT: .LBB0_2:
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -179,13 +179,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1132-NEXT: .LBB0_2:
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -193,95 +193,95 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -296,14 +296,14 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1164-DPP-NEXT: .LBB0_2:
; GFX1164-DPP-NEXT: s_nop 0
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -318,13 +318,13 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
+; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[4:5]
; GFX1132-DPP-NEXT: .LBB0_2:
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1119,11 +1119,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -1131,12 +1131,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1158,64 +1158,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1238,27 +1238,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1269,8 +1269,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -1280,25 +1280,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1323,11 +1323,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -1335,12 +1335,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1362,64 +1362,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1442,27 +1442,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1473,8 +1473,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -1484,25 +1484,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2349,11 +2349,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -2361,12 +2361,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2388,64 +2388,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2499,8 +2499,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -2510,25 +2510,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2553,11 +2553,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -2565,12 +2565,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2592,64 +2592,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2672,27 +2672,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2703,8 +2703,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -2714,25 +2714,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4247,11 +4247,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -4259,12 +4259,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
@@ -4286,64 +4286,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
@@ -4366,27 +4366,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
@@ -4397,8 +4397,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -4408,25 +4408,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
@@ -4451,11 +4451,11 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -4463,12 +4463,12 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4490,64 +4490,64 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4570,27 +4570,27 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4601,8 +4601,8 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -4612,25 +4612,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5452,101 +5452,101 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
;
; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
@@ -5562,165 +5562,165 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5736,64 +5736,64 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6349,11 +6349,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -6361,13 +6361,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
@@ -6387,68 +6387,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
@@ -6471,28 +6471,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
@@ -6503,8 +6503,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -6514,25 +6514,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
@@ -6557,11 +6557,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -6569,13 +6569,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6595,68 +6595,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6679,28 +6679,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6711,8 +6711,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -6722,25 +6722,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -7296,11 +7296,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -7308,13 +7308,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -7334,68 +7334,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -7418,28 +7418,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -7450,8 +7450,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -7461,25 +7461,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -7504,11 +7504,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -7516,13 +7516,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -7542,68 +7542,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -7626,28 +7626,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -7658,8 +7658,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -7669,25 +7669,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -8721,11 +8721,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -8733,13 +8733,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
@@ -8759,68 +8759,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
@@ -8843,28 +8843,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
; GFX1164-NEXT: s_endpgm
@@ -8875,8 +8875,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -8886,25 +8886,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
@@ -8929,11 +8929,11 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -8941,13 +8941,13 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -8967,68 +8967,68 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -9051,28 +9051,28 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -9083,8 +9083,8 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -9094,25 +9094,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -9637,330 +9637,330 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB18_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB18_2
; GFX9-NEXT: .LBB18_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB18_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB18_2
; GFX1064-NEXT: .LBB18_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB18_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB18_2
; GFX1032-NEXT: .LBB18_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB18_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB18_2
; GFX1164-NEXT: .LBB18_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB18_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB18_2
; GFX1132-NEXT: .LBB18_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX9-DPP-NEXT: .LBB18_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1064-DPP-NEXT: .LBB18_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1032-DPP-NEXT: .LBB18_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1164-DPP-NEXT: .LBB18_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2
; GFX1132-DPP-NEXT: .LBB18_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -10007,330 +10007,330 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
;
; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB19_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB19_2
; GFX9-NEXT: .LBB19_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB19_2
; GFX1064-NEXT: .LBB19_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB19_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB19_2
; GFX1032-NEXT: .LBB19_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB19_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB19_2
; GFX1164-NEXT: .LBB19_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB19_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB19_2
; GFX1132-NEXT: .LBB19_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX9-DPP-NEXT: .LBB19_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1064-DPP-NEXT: .LBB19_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1032-DPP-NEXT: .LBB19_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1164-DPP-NEXT: .LBB19_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2
; GFX1132-DPP-NEXT: .LBB19_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6548792180a0e..fdb36b3f574d5 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -57,23 +57,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
@@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
@@ -110,27 +110,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -144,25 +144,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
@@ -170,30 +170,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
@@ -206,23 +206,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -235,23 +235,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -259,27 +259,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -293,25 +293,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -319,30 +319,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1235,23 +1235,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1264,23 +1264,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
@@ -1288,27 +1288,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1322,25 +1322,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1348,30 +1348,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1384,23 +1384,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1413,23 +1413,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -1437,27 +1437,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1471,25 +1471,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1497,30 +1497,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2415,23 +2415,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2444,23 +2444,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2502,25 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2528,30 +2528,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2564,23 +2564,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2593,23 +2593,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -2617,27 +2617,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2651,25 +2651,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2677,30 +2677,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -3597,11 +3597,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -3609,13 +3609,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
@@ -3628,25 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
@@ -3654,29 +3654,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
@@ -3690,27 +3690,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
; GFX1164-NEXT: s_endpgm
@@ -3718,30 +3718,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
@@ -3754,11 +3754,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -3766,13 +3766,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -3785,25 +3785,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -3811,29 +3811,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -3847,27 +3847,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -3875,30 +3875,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4456,11 +4456,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -4468,13 +4468,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-NEXT: .LBB8_3:
; GFX9-NEXT: s_endpgm
@@ -4487,25 +4487,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-NEXT: .LBB8_3:
; GFX1064-NEXT: s_endpgm
@@ -4513,29 +4513,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-NEXT: .LBB8_3:
; GFX1032-NEXT: s_endpgm
@@ -4549,27 +4549,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-NEXT: .LBB8_3:
; GFX1164-NEXT: s_endpgm
@@ -4577,30 +4577,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
@@ -4613,11 +4613,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -4625,13 +4625,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4644,25 +4644,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -4670,29 +4670,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4706,27 +4706,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4734,30 +4734,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5315,11 +5315,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5327,13 +5327,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
@@ -5346,25 +5346,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
@@ -5372,29 +5372,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
@@ -5408,27 +5408,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
; GFX1164-NEXT: s_endpgm
@@ -5436,30 +5436,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
@@ -5472,11 +5472,11 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -5484,13 +5484,13 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -5503,25 +5503,25 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -5529,29 +5529,29 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5565,27 +5565,27 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -5593,30 +5593,30 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6170,23 +6170,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
@@ -6199,23 +6199,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-NEXT: .LBB12_3:
; GFX1064-NEXT: s_endpgm
@@ -6223,27 +6223,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-NEXT: .LBB12_3:
; GFX1032-NEXT: s_endpgm
@@ -6257,25 +6257,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-NEXT: .LBB12_3:
; GFX1164-NEXT: s_endpgm
@@ -6283,30 +6283,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
@@ -6319,23 +6319,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6348,23 +6348,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6372,27 +6372,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6406,25 +6406,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6432,30 +6432,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6505,23 +6505,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -6534,23 +6534,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
@@ -6558,27 +6558,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -6592,25 +6592,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -6618,30 +6618,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -6654,23 +6654,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6683,23 +6683,23 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6707,27 +6707,27 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6741,25 +6741,25 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6767,30 +6767,30 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 6936cdc4d379a..d47a424002615 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -57,23 +57,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
@@ -86,23 +86,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
@@ -110,27 +110,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
@@ -144,25 +144,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
@@ -170,30 +170,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
@@ -206,23 +206,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -235,23 +235,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -259,27 +259,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -293,25 +293,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -319,30 +319,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1235,23 +1235,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB2_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1264,23 +1264,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
@@ -1288,27 +1288,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1322,25 +1322,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1348,30 +1348,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1384,23 +1384,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1413,23 +1413,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -1437,27 +1437,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1471,25 +1471,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1497,30 +1497,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2415,23 +2415,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB4_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2444,23 +2444,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
@@ -2468,27 +2468,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2502,25 +2502,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2528,30 +2528,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2564,23 +2564,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2593,23 +2593,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -2617,27 +2617,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2651,25 +2651,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2677,30 +2677,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -3597,11 +3597,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB6_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -3609,13 +3609,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-NEXT: .LBB6_3:
; GFX9-NEXT: s_endpgm
@@ -3628,25 +3628,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-NEXT: .LBB6_3:
; GFX1064-NEXT: s_endpgm
@@ -3654,29 +3654,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-NEXT: .LBB6_3:
; GFX1032-NEXT: s_endpgm
@@ -3690,27 +3690,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-NEXT: .LBB6_3:
; GFX1164-NEXT: s_endpgm
@@ -3718,30 +3718,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-NEXT: .LBB6_3:
; GFX1132-NEXT: s_endpgm
@@ -3754,11 +3754,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -3766,13 +3766,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX9-DPP-NEXT: .LBB6_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -3785,25 +3785,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1064-DPP-NEXT: .LBB6_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -3811,29 +3811,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1032-DPP-NEXT: .LBB6_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -3847,27 +3847,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1164-DPP-NEXT: .LBB6_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -3875,30 +3875,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
; GFX1132-DPP-NEXT: .LBB6_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4456,11 +4456,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB8_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -4468,13 +4468,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-NEXT: .LBB8_3:
; GFX9-NEXT: s_endpgm
@@ -4487,25 +4487,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-NEXT: .LBB8_3:
; GFX1064-NEXT: s_endpgm
@@ -4513,29 +4513,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-NEXT: .LBB8_3:
; GFX1032-NEXT: s_endpgm
@@ -4549,27 +4549,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-NEXT: .LBB8_3:
; GFX1164-NEXT: s_endpgm
@@ -4577,30 +4577,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-NEXT: .LBB8_3:
; GFX1132-NEXT: s_endpgm
@@ -4613,11 +4613,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -4625,13 +4625,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX9-DPP-NEXT: .LBB8_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4644,25 +4644,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1064-DPP-NEXT: .LBB8_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -4670,29 +4670,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1032-DPP-NEXT: .LBB8_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4706,27 +4706,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1164-DPP-NEXT: .LBB8_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4734,30 +4734,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
; GFX1132-DPP-NEXT: .LBB8_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5315,11 +5315,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB10_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5327,13 +5327,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-NEXT: .LBB10_3:
; GFX9-NEXT: s_endpgm
@@ -5346,25 +5346,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-NEXT: .LBB10_3:
; GFX1064-NEXT: s_endpgm
@@ -5372,29 +5372,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-NEXT: .LBB10_3:
; GFX1032-NEXT: s_endpgm
@@ -5408,27 +5408,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-NEXT: .LBB10_3:
; GFX1164-NEXT: s_endpgm
@@ -5436,30 +5436,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-NEXT: .LBB10_3:
; GFX1132-NEXT: s_endpgm
@@ -5472,11 +5472,11 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
@@ -5484,13 +5484,13 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX9-DPP-NEXT: .LBB10_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -5503,25 +5503,25 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1064-DPP-NEXT: .LBB10_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -5529,29 +5529,29 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1032-DPP-NEXT: .LBB10_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5565,27 +5565,27 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1164-DPP-NEXT: .LBB10_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -5593,30 +5593,30 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
; GFX1132-DPP-NEXT: .LBB10_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6170,23 +6170,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB12_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-NEXT: .LBB12_3:
; GFX9-NEXT: s_endpgm
@@ -6199,23 +6199,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-NEXT: .LBB12_3:
; GFX1064-NEXT: s_endpgm
@@ -6223,27 +6223,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-NEXT: .LBB12_3:
; GFX1032-NEXT: s_endpgm
@@ -6257,25 +6257,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-NEXT: .LBB12_3:
; GFX1164-NEXT: s_endpgm
@@ -6283,30 +6283,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-NEXT: .LBB12_3:
; GFX1132-NEXT: s_endpgm
@@ -6319,23 +6319,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX9-DPP-NEXT: .LBB12_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6348,23 +6348,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1064-DPP-NEXT: .LBB12_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6372,27 +6372,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1032-DPP-NEXT: .LBB12_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6406,25 +6406,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1164-DPP-NEXT: .LBB12_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6432,30 +6432,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
; GFX1132-DPP-NEXT: .LBB12_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6505,23 +6505,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB13_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -6534,23 +6534,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
@@ -6558,27 +6558,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032: ; %bb.0:
; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -6592,25 +6592,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -6618,30 +6618,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132: ; %bb.0:
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -6654,23 +6654,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6683,23 +6683,23 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
@@ -6707,27 +6707,27 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1032-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1032-DPP: ; %bb.0:
; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6741,25 +6741,25 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6767,30 +6767,30 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
; GFX1132-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX1132-DPP: ; %bb.0:
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 5cb57703c01d9..1d251f9e47315 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -54,330 +54,330 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
;
; GFX9-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB0_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-NEXT: .LBB0_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-NEXT: .LBB0_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-NEXT: .LBB0_3:
; GFX1032-NEXT: s_endpgm
;
; GFX1164-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164: ; %bb.0:
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-NEXT: .LBB0_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-NEXT: .LBB0_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
-; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX9-DPP-NEXT: .LBB0_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1064-DPP-NEXT: .LBB0_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1032-DPP-NEXT: .LBB0_3:
; GFX1032-DPP-NEXT: s_endpgm
;
; GFX1164-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1164-DPP: ; %bb.0:
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
-; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1164-DPP-NEXT: ; %bb.1:
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s1, s[4:5]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1164-DPP-NEXT: .LBB0_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s1, s5
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
+; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s1
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s0
; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
; GFX1132-DPP-NEXT: .LBB0_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -1223,11 +1223,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-NEXT: .LBB2_3:
; GFX9-NEXT: s_endpgm
@@ -1262,64 +1262,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-NEXT: .LBB2_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-NEXT: .LBB2_3:
; GFX1032-NEXT: s_endpgm
@@ -1342,27 +1342,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-NEXT: .LBB2_3:
; GFX1164-NEXT: s_endpgm
@@ -1373,8 +1373,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -1384,25 +1384,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-NEXT: .LBB2_3:
; GFX1132-NEXT: s_endpgm
@@ -1427,11 +1427,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -1439,12 +1439,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX9-DPP-NEXT: .LBB2_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -1466,64 +1466,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1064-DPP-NEXT: .LBB2_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1032-DPP-NEXT: .LBB2_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -1546,27 +1546,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1164-DPP-NEXT: .LBB2_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -1577,8 +1577,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -1588,25 +1588,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
; GFX1132-DPP-NEXT: .LBB2_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -2453,11 +2453,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -2465,12 +2465,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-NEXT: .LBB4_3:
; GFX9-NEXT: s_endpgm
@@ -2492,64 +2492,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-NEXT: .LBB4_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-NEXT: .LBB4_3:
; GFX1032-NEXT: s_endpgm
@@ -2572,27 +2572,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-NEXT: .LBB4_3:
; GFX1164-NEXT: s_endpgm
@@ -2603,8 +2603,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -2614,25 +2614,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-NEXT: .LBB4_3:
; GFX1132-NEXT: s_endpgm
@@ -2657,11 +2657,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -2669,12 +2669,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX9-DPP-NEXT: .LBB4_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -2696,64 +2696,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1064-DPP-NEXT: .LBB4_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1032-DPP-NEXT: .LBB4_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -2776,27 +2776,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1164-DPP-NEXT: .LBB4_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -2807,8 +2807,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -2818,25 +2818,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
; GFX1132-DPP-NEXT: .LBB4_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -4455,11 +4455,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s4
@@ -4467,12 +4467,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-NEXT: .LBB7_3:
; GFX9-NEXT: s_endpgm
@@ -4494,64 +4494,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-NEXT: .LBB7_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-NEXT: .LBB7_3:
; GFX1032-NEXT: s_endpgm
@@ -4574,27 +4574,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-NEXT: .LBB7_3:
; GFX1164-NEXT: s_endpgm
@@ -4605,8 +4605,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -4616,25 +4616,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-NEXT: .LBB7_3:
; GFX1132-NEXT: s_endpgm
@@ -4659,11 +4659,11 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dword s4, s[2:3], 0x0
; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
@@ -4671,12 +4671,12 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX9-DPP-NEXT: .LBB7_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -4698,64 +4698,64 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1064-DPP-NEXT: ; %bb.1:
; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1064-DPP-NEXT: .LBB7_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
-; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-DPP-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1032-DPP-NEXT: .LBB7_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -4778,27 +4778,27 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1164-DPP-NEXT: .LBB7_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -4809,8 +4809,8 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -4820,25 +4820,25 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mul_f32 v2, 4.0, v0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mul_f32 v2, 4.0, v0
; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
; GFX1132-DPP-NEXT: .LBB7_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -5660,101 +5660,101 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
;
; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-NEXT: s_cbranch_execz .LBB9_3
; GFX9-NEXT: ; %bb.1:
-; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-NEXT: .LBB9_3:
; GFX9-NEXT: s_endpgm
;
; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-NEXT: .LBB9_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-NEXT: .LBB9_3:
; GFX1032-NEXT: s_endpgm
@@ -5770,165 +5770,165 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-NEXT: .LBB9_3:
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-NEXT: ; %bb.1:
-; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-NEXT: .LBB9_3:
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX9-DPP: ; %bb.0:
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX9-DPP-NEXT: ; %bb.1:
-; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX9-DPP-NEXT: .LBB9_3:
; GFX9-DPP-NEXT: s_endpgm
;
; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1064-DPP: ; %bb.0:
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
-; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
-; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1064-DPP-NEXT: .LBB9_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1032-DPP: ; %bb.0:
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s5
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
-; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1032-DPP-NEXT: .LBB9_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -5944,64 +5944,64 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
-; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1164-DPP-NEXT: .LBB9_3:
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
; GFX1132-DPP-NEXT: ; %bb.1:
-; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, s2
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
-; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
; GFX1132-DPP-NEXT: .LBB9_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -6557,11 +6557,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -6569,13 +6569,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-NEXT: .LBB11_3:
; GFX9-NEXT: s_endpgm
@@ -6595,68 +6595,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-NEXT: .LBB11_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-NEXT: .LBB11_3:
; GFX1032-NEXT: s_endpgm
@@ -6679,28 +6679,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-NEXT: .LBB11_3:
; GFX1164-NEXT: s_endpgm
@@ -6711,8 +6711,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -6722,25 +6722,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-NEXT: .LBB11_3:
; GFX1132-NEXT: s_endpgm
@@ -6765,11 +6765,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -6777,13 +6777,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX9-DPP-NEXT: .LBB11_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -6803,68 +6803,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1064-DPP-NEXT: .LBB11_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1032-DPP-NEXT: .LBB11_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -6887,28 +6887,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1164-DPP-NEXT: .LBB11_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -6919,8 +6919,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -6930,25 +6930,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
; GFX1132-DPP-NEXT: .LBB11_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -7503,11 +7503,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -7515,13 +7515,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-NEXT: .LBB13_3:
; GFX9-NEXT: s_endpgm
@@ -7541,68 +7541,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-NEXT: .LBB13_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-NEXT: .LBB13_3:
; GFX1032-NEXT: s_endpgm
@@ -7625,28 +7625,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-NEXT: .LBB13_3:
; GFX1164-NEXT: s_endpgm
@@ -7657,8 +7657,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -7668,25 +7668,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-NEXT: .LBB13_3:
; GFX1132-NEXT: s_endpgm
@@ -7711,11 +7711,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -7723,13 +7723,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX9-DPP-NEXT: .LBB13_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -7749,68 +7749,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1064-DPP-NEXT: .LBB13_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1032-DPP-NEXT: .LBB13_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -7833,28 +7833,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1164-DPP-NEXT: .LBB13_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -7865,8 +7865,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -7876,25 +7876,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
; GFX1132-DPP-NEXT: .LBB13_3:
; GFX1132-DPP-NEXT: s_endpgm
@@ -8927,11 +8927,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -8939,13 +8939,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-NEXT: .LBB16_3:
; GFX9-NEXT: s_endpgm
@@ -8965,68 +8965,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-NEXT: ; %bb.1:
-; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-NEXT: .LBB16_3:
; GFX1064-NEXT: s_endpgm
;
; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-NEXT: s_mov_b32 s10, -1
; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-NEXT: ; %bb.1:
-; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-NEXT: .LBB16_3:
; GFX1032-NEXT: s_endpgm
@@ -9049,28 +9049,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-NEXT: ; %bb.1:
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-NEXT: s_waitcnt vmcnt(0)
; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-NEXT: .LBB16_3:
; GFX1164-NEXT: s_endpgm
@@ -9081,8 +9081,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-NEXT: s_mov_b32 s2, 0
-; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_mov_b32 s4, 0
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-NEXT: s_clause 0x1
; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-NEXT: scratch_store_b32 off, v1, off
@@ -9092,25 +9092,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-NEXT: ; %bb.1:
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-NEXT: s_waitcnt vmcnt(0)
; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-NEXT: .LBB16_3:
; GFX1132-NEXT: s_endpgm
@@ -9135,11 +9135,11 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
@@ -9147,13 +9147,13 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
-; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX9-DPP-NEXT: .LBB16_3:
; GFX9-DPP-NEXT: s_endpgm
@@ -9173,68 +9173,68 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1064-DPP-NEXT: ; %bb.1:
-; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
-; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1064-DPP-NEXT: .LBB16_3:
; GFX1064-DPP-NEXT: s_endpgm
;
; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
-; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
-; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
; GFX1032-DPP-NEXT: ; %bb.1:
-; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
-; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
-; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s0
; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1032-DPP-NEXT: .LBB16_3:
; GFX1032-DPP-NEXT: s_endpgm
@@ -9257,28 +9257,28 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1164-DPP-NEXT: ; %bb.1:
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
-; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
-; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
-; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1164-DPP-NEXT: .LBB16_3:
; GFX1164-DPP-NEXT: s_endpgm
@@ -9289,8 +9289,8 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
-; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
; GFX1132-DPP-NEXT: s_clause 0x1
; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
@@ -9300,25 +9300,25 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
; GFX1132-DPP-NEXT: ; %bb.1:
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
-; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
-; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[2:3] glc
; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
; GFX1132-DPP-NEXT: .LBB16_3:
; GFX1132-DPP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 297b5180dfe9b..5abd4c9069c91 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -35,8 +35,8 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0
-; CHECK-NEXT: s_movk_i32 s4, 0x130
-; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: s_movk_i32 s20, 0x130
+; CHECK-NEXT: s_mov_b32 s21, s24
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_writelane_b32 v4, s36, 0
; CHECK-NEXT: v_writelane_b32 v4, s37, 1
@@ -49,7 +49,7 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v4, s44, 8
; CHECK-NEXT: v_writelane_b32 v4, s45, 9
; CHECK-NEXT: v_writelane_b32 v4, s46, 10
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[4:19], s[20:21], 0x0
; CHECK-NEXT: v_writelane_b32 v4, s47, 11
; CHECK-NEXT: v_writelane_b32 v4, s48, 12
; CHECK-NEXT: v_writelane_b32 v4, s49, 13
@@ -78,17 +78,17 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v4, s13, 25
; CHECK-NEXT: v_writelane_b32 v4, s14, 26
; CHECK-NEXT: v_writelane_b32 v4, s15, 27
-; CHECK-NEXT: v_writelane_b32 v4, s16, 28
; CHECK-NEXT: v_writelane_b32 v8, s52, 18
-; CHECK-NEXT: v_writelane_b32 v4, s17, 29
+; CHECK-NEXT: v_writelane_b32 v4, s16, 28
; CHECK-NEXT: v_writelane_b32 v8, s53, 19
-; CHECK-NEXT: v_writelane_b32 v4, s18, 30
+; CHECK-NEXT: v_writelane_b32 v4, s17, 29
; CHECK-NEXT: v_writelane_b32 v8, s54, 20
-; CHECK-NEXT: v_writelane_b32 v4, s19, 31
-; CHECK-NEXT: s_mov_b32 s4, 48
-; CHECK-NEXT: s_mov_b32 s5, s24
+; CHECK-NEXT: v_writelane_b32 v4, s18, 30
+; CHECK-NEXT: s_mov_b32 s26, 48
+; CHECK-NEXT: s_mov_b32 s27, s24
; CHECK-NEXT: v_writelane_b32 v8, s55, 21
-; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
+; CHECK-NEXT: v_writelane_b32 v4, s19, 31
+; CHECK-NEXT: s_load_dwordx8 s[4:11], s[26:27], 0x0
; CHECK-NEXT: v_writelane_b32 v8, s56, 22
; CHECK-NEXT: v_writelane_b32 v8, s57, 23
; CHECK-NEXT: v_writelane_b32 v8, s58, 24
@@ -107,15 +107,15 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: v_writelane_b32 v8, s65, 31
; CHECK-NEXT: v_writelane_b32 v4, s9, 37
; CHECK-NEXT: v_writelane_b32 v8, s66, 32
-; CHECK-NEXT: s_movk_i32 s26, 0x1f0
-; CHECK-NEXT: s_movk_i32 s28, 0x2f0
-; CHECK-NEXT: s_mov_b32 s27, s24
+; CHECK-NEXT: s_movk_i32 s28, 0x1f0
+; CHECK-NEXT: s_movk_i32 s30, 0x2f0
; CHECK-NEXT: s_mov_b32 s29, s24
+; CHECK-NEXT: s_mov_b32 s31, s24
; CHECK-NEXT: v_writelane_b32 v4, s10, 38
; CHECK-NEXT: v_writelane_b32 v8, s67, 33
; CHECK-NEXT: v_writelane_b32 v4, s11, 39
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0
-; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[28:29], 0x0
+; CHECK-NEXT: s_load_dwordx16 s[4:19], s[30:31], 0x0
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1
diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
index 7ee31bf4dce7c..c6342e5745832 100644
--- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
+++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll
@@ -7,9 +7,9 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: udiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
@@ -36,15 +36,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: s_add_i32 s10, s11, 1
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s11
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
+; GFX9-NEXT: s_add_u32 s10, s2, s0
+; GFX9-NEXT: s_addc_u32 s11, s3, s1
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_add_u32 s0, s0, 4
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_cbranch_scc0 .LBB0_1
; GFX9-NEXT: ; %bb.2: ; %bb2
@@ -52,20 +52,21 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
;
; GFX10-LABEL: udiv32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s0, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
+; GFX10-NEXT: s_mul_i32 s0, s0, s4
+; GFX10-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB0_1: ; %bb3
@@ -83,15 +84,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_add_i32 s10, s11, 1
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s11
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
+; GFX10-NEXT: s_add_u32 s10, s2, s0
+; GFX10-NEXT: s_addc_u32 s11, s3, s1
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX10-NEXT: s_add_u32 s0, s0, 4
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_cbranch_scc0 .LBB0_1
; GFX10-NEXT: ; %bb.2: ; %bb2
@@ -101,11 +102,11 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s0, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -114,10 +115,10 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
+; GFX11-NEXT: s_mul_i32 s0, s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
+; GFX11-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: .p2align 6
@@ -136,15 +137,15 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_add_i32 s10, s11, 1
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s11
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
+; GFX11-NEXT: s_add_u32 s10, s2, s0
+; GFX11-NEXT: s_addc_u32 s11, s3, s1
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX11-NEXT: s_add_u32 s0, s0, 4
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_cbranch_scc0 .LBB0_1
; GFX11-NEXT: ; %bb.2: ; %bb2
@@ -172,9 +173,9 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: urem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX9-NEXT: s_sub_i32 s4, 0, s6
@@ -199,15 +200,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: s_sub_i32 s10, s9, s6
; GFX9-NEXT: s_cmp_ge_u32 s9, s6
; GFX9-NEXT: s_cselect_b32 s9, s10, s9
-; GFX9-NEXT: s_add_u32 s10, s0, s2
-; GFX9-NEXT: s_addc_u32 s11, s1, s3
+; GFX9-NEXT: s_add_u32 s10, s2, s0
+; GFX9-NEXT: s_addc_u32 s11, s3, s1
; GFX9-NEXT: s_add_i32 s7, s7, 1
; GFX9-NEXT: s_add_u32 s4, s4, s8
; GFX9-NEXT: s_addc_u32 s5, s5, 0
-; GFX9-NEXT: s_add_u32 s2, s2, 4
-; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_add_u32 s0, s0, 4
+; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX9-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NEXT: s_cbranch_scc0 .LBB1_1
; GFX9-NEXT: ; %bb.2: ; %bb2
@@ -215,20 +216,21 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
;
; GFX10-LABEL: urem32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
+; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dword s6, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s7, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX10-NEXT: s_sub_i32 s2, 0, s6
+; GFX10-NEXT: s_sub_i32 s0, 0, s6
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX10-NEXT: v_readfirstlane_b32 s4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s2, s2, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX10-NEXT: s_mov_b64 s[2:3], 0
+; GFX10-NEXT: s_mul_i32 s0, s0, s4
+; GFX10-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX10-NEXT: s_mov_b64 s[0:1], 0
; GFX10-NEXT: s_add_i32 s8, s4, s5
; GFX10-NEXT: s_mov_b64 s[4:5], 0
; GFX10-NEXT: .LBB1_1: ; %bb3
@@ -244,15 +246,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10-NEXT: s_sub_i32 s10, s9, s6
; GFX10-NEXT: s_cmp_ge_u32 s9, s6
; GFX10-NEXT: s_cselect_b32 s9, s10, s9
-; GFX10-NEXT: s_add_u32 s10, s0, s2
-; GFX10-NEXT: s_addc_u32 s11, s1, s3
+; GFX10-NEXT: s_add_u32 s10, s2, s0
+; GFX10-NEXT: s_addc_u32 s11, s3, s1
; GFX10-NEXT: s_add_i32 s7, s7, 1
; GFX10-NEXT: s_add_u32 s4, s4, s8
; GFX10-NEXT: v_mov_b32_e32 v1, s9
; GFX10-NEXT: s_addc_u32 s5, s5, 0
-; GFX10-NEXT: s_add_u32 s2, s2, 4
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX10-NEXT: s_add_u32 s0, s0, 4
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX10-NEXT: global_store_dword v0, v1, s[10:11]
; GFX10-NEXT: s_cbranch_scc0 .LBB1_1
; GFX10-NEXT: ; %bb.2: ; %bb2
@@ -262,11 +264,11 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s7, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6
-; GFX11-NEXT: s_sub_i32 s2, 0, s6
+; GFX11-NEXT: s_sub_i32 s0, 0, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
@@ -275,10 +277,10 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s2, s2, s4
+; GFX11-NEXT: s_mul_i32 s0, s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s2
-; GFX11-NEXT: s_mov_b64 s[2:3], 0
+; GFX11-NEXT: s_mul_hi_u32 s5, s4, s0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
; GFX11-NEXT: s_add_i32 s8, s4, s5
; GFX11-NEXT: s_mov_b64 s[4:5], 0
; GFX11-NEXT: .p2align 6
@@ -296,15 +298,15 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_sub_i32 s10, s9, s6
; GFX11-NEXT: s_cmp_ge_u32 s9, s6
; GFX11-NEXT: s_cselect_b32 s9, s10, s9
-; GFX11-NEXT: s_add_u32 s10, s0, s2
-; GFX11-NEXT: s_addc_u32 s11, s1, s3
+; GFX11-NEXT: s_add_u32 s10, s2, s0
+; GFX11-NEXT: s_addc_u32 s11, s3, s1
; GFX11-NEXT: s_add_i32 s7, s7, 1
; GFX11-NEXT: s_add_u32 s4, s4, s8
; GFX11-NEXT: v_mov_b32_e32 v1, s9
; GFX11-NEXT: s_addc_u32 s5, s5, 0
-; GFX11-NEXT: s_add_u32 s2, s2, 4
-; GFX11-NEXT: s_addc_u32 s3, s3, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x1000
+; GFX11-NEXT: s_add_u32 s0, s0, 4
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x1000
; GFX11-NEXT: global_store_b32 v0, v1, s[10:11]
; GFX11-NEXT: s_cbranch_scc0 .LBB1_1
; GFX11-NEXT: ; %bb.2: ; %bb2
@@ -331,14 +333,14 @@ bb3: ; preds = %bb3, %bb
define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %arg, i32 %arg1) {
; GFX9-LABEL: sdiv32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dword s5, s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_abs_i32 s2, s4
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s5, 0, s2
-; GFX9-NEXT: s_ashr_i32 s4, s4, 31
+; GFX9-NEXT: s_abs_i32 s4, s5
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_ashr_i32 s1, s5, 31
+; GFX9-NEXT: s_sub_i32 s5, 0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -349,70 +351,70 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB2_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_mul_hi_u32 s6, s3, s5
-; GFX9-NEXT: s_mul_i32 s7, s6, s2
-; GFX9-NEXT: s_sub_i32 s7, s3, s7
+; GFX9-NEXT: s_mul_hi_u32 s6, s0, s5
+; GFX9-NEXT: s_mul_i32 s7, s6, s4
+; GFX9-NEXT: s_sub_i32 s7, s0, s7
; GFX9-NEXT: s_add_i32 s8, s6, 1
-; GFX9-NEXT: s_sub_i32 s9, s7, s2
-; GFX9-NEXT: s_cmp_ge_u32 s7, s2
+; GFX9-NEXT: s_sub_i32 s9, s7, s4
+; GFX9-NEXT: s_cmp_ge_u32 s7, s4
; GFX9-NEXT: s_cselect_b32 s6, s8, s6
; GFX9-NEXT: s_cselect_b32 s7, s9, s7
; GFX9-NEXT: s_add_i32 s8, s6, 1
-; GFX9-NEXT: s_cmp_ge_u32 s7, s2
+; GFX9-NEXT: s_cmp_ge_u32 s7, s4
; GFX9-NEXT: s_cselect_b32 s6, s8, s6
-; GFX9-NEXT: s_xor_b32 s6, s6, s4
-; GFX9-NEXT: s_sub_i32 s6, s6, s4
-; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_xor_b32 s6, s6, s1
+; GFX9-NEXT: s_sub_i32 s6, s6, s1
+; GFX9-NEXT: s_add_i32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s6
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 .LBB2_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sdiv32_invariant_denom:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dword s3, s[0:1], 0x2c
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_load_dword s5, s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_abs_i32 s2, s3
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT: s_sub_i32 s4, 0, s2
-; GFX10-NEXT: s_ashr_i32 s3, s3, 31
+; GFX10-NEXT: s_abs_i32 s4, s5
+; GFX10-NEXT: s_ashr_i32 s0, s5, 31
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT: s_sub_i32 s1, 0, s4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s5, v0
+; GFX10-NEXT: v_readfirstlane_b32 s6, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s4, s4, s5
-; GFX10-NEXT: s_mul_hi_u32 s6, s5, s4
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: s_add_i32 s5, s5, s6
+; GFX10-NEXT: s_mul_i32 s1, s1, s6
+; GFX10-NEXT: s_mul_hi_u32 s5, s6, s1
+; GFX10-NEXT: s_mov_b32 s1, 0
+; GFX10-NEXT: s_add_i32 s5, s6, s5
; GFX10-NEXT: .LBB2_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_mul_hi_u32 s6, s4, s5
-; GFX10-NEXT: s_mul_i32 s7, s6, s2
+; GFX10-NEXT: s_mul_hi_u32 s6, s1, s5
+; GFX10-NEXT: s_mul_i32 s7, s6, s4
; GFX10-NEXT: s_add_i32 s8, s6, 1
-; GFX10-NEXT: s_sub_i32 s7, s4, s7
-; GFX10-NEXT: s_sub_i32 s9, s7, s2
-; GFX10-NEXT: s_cmp_ge_u32 s7, s2
+; GFX10-NEXT: s_sub_i32 s7, s1, s7
+; GFX10-NEXT: s_sub_i32 s9, s7, s4
+; GFX10-NEXT: s_cmp_ge_u32 s7, s4
; GFX10-NEXT: s_cselect_b32 s6, s8, s6
; GFX10-NEXT: s_cselect_b32 s7, s9, s7
; GFX10-NEXT: s_add_i32 s8, s6, 1
-; GFX10-NEXT: s_cmp_ge_u32 s7, s2
+; GFX10-NEXT: s_cmp_ge_u32 s7, s4
; GFX10-NEXT: s_cselect_b32 s6, s8, s6
-; GFX10-NEXT: s_add_i32 s4, s4, 1
-; GFX10-NEXT: s_xor_b32 s6, s6, s3
-; GFX10-NEXT: s_sub_i32 s6, s6, s3
+; GFX10-NEXT: s_add_i32 s1, s1, 1
+; GFX10-NEXT: s_xor_b32 s6, s6, s0
+; GFX10-NEXT: s_sub_i32 s6, s6, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s6
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s1, 0x400
; GFX10-NEXT: s_cbranch_scc0 .LBB2_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
@@ -420,51 +422,51 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: sdiv32_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_abs_i32 s2, s3
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s4, 0, s2
+; GFX11-NEXT: s_abs_i32 s4, s5
+; GFX11-NEXT: s_ashr_i32 s0, s5, 31
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX11-NEXT: s_sub_i32 s1, 0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_readfirstlane_b32 s5, v0
+; GFX11-NEXT: v_readfirstlane_b32 s6, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mul_i32 s4, s4, s5
+; GFX11-NEXT: s_mul_i32 s1, s1, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s6, s5, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: s_add_i32 s5, s5, s6
+; GFX11-NEXT: s_mul_hi_u32 s5, s6, s1
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_add_i32 s5, s6, s5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB2_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s6, s4, s5
-; GFX11-NEXT: s_mul_i32 s7, s6, s2
+; GFX11-NEXT: s_mul_hi_u32 s6, s1, s5
+; GFX11-NEXT: s_mul_i32 s7, s6, s4
; GFX11-NEXT: s_add_i32 s8, s6, 1
-; GFX11-NEXT: s_sub_i32 s7, s4, s7
+; GFX11-NEXT: s_sub_i32 s7, s1, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s9, s7, s2
-; GFX11-NEXT: s_cmp_ge_u32 s7, s2
+; GFX11-NEXT: s_sub_i32 s9, s7, s4
+; GFX11-NEXT: s_cmp_ge_u32 s7, s4
; GFX11-NEXT: s_cselect_b32 s6, s8, s6
; GFX11-NEXT: s_cselect_b32 s7, s9, s7
; GFX11-NEXT: s_add_i32 s8, s6, 1
-; GFX11-NEXT: s_cmp_ge_u32 s7, s2
+; GFX11-NEXT: s_cmp_ge_u32 s7, s4
; GFX11-NEXT: s_cselect_b32 s6, s8, s6
-; GFX11-NEXT: s_add_i32 s4, s4, 1
-; GFX11-NEXT: s_xor_b32 s6, s6, s3
+; GFX11-NEXT: s_add_i32 s1, s1, 1
+; GFX11-NEXT: s_xor_b32 s6, s6, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s6, s6, s3
+; GFX11-NEXT: s_sub_i32 s6, s6, s0
; GFX11-NEXT: v_mov_b32_e32 v1, s6
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s1, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB2_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -491,37 +493,38 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: srem32_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s3, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_abs_i32 s2, s2
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX9-NEXT: s_sub_i32 s4, 0, s2
+; GFX9-NEXT: s_abs_i32 s4, s2
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_sub_i32 s1, 0, s4
+; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX9-NEXT: v_readfirstlane_b32 s5, v0
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
-; GFX9-NEXT: s_add_i32 s4, s5, s4
+; GFX9-NEXT: s_mul_i32 s1, s1, s5
+; GFX9-NEXT: s_mul_hi_u32 s1, s5, s1
+; GFX9-NEXT: s_add_i32 s1, s5, s1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: .LBB3_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX9-NEXT: s_mul_i32 s5, s5, s2
-; GFX9-NEXT: s_sub_i32 s5, s3, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
+; GFX9-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX9-NEXT: s_mul_i32 s5, s5, s4
+; GFX9-NEXT: s_sub_i32 s5, s0, s5
+; GFX9-NEXT: s_sub_i32 s6, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_sub_i32 s6, s5, s2
-; GFX9-NEXT: s_cmp_ge_u32 s5, s2
+; GFX9-NEXT: s_sub_i32 s6, s5, s4
+; GFX9-NEXT: s_cmp_ge_u32 s5, s4
; GFX9-NEXT: s_cselect_b32 s5, s6, s5
-; GFX9-NEXT: s_add_i32 s3, s3, 1
+; GFX9-NEXT: s_add_i32 s0, s0, 1
; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: s_add_u32 s0, s0, 4
-; GFX9-NEXT: s_addc_u32 s1, s1, 0
-; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, 4
+; GFX9-NEXT: s_addc_u32 s3, s3, 0
+; GFX9-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX9-NEXT: s_cbranch_scc0 .LBB3_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -530,85 +533,85 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_abs_i32 s2, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX10-NEXT: s_sub_i32 s3, 0, s2
+; GFX10-NEXT: s_abs_i32 s4, s2
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX10-NEXT: s_sub_i32 s0, 0, s4
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s4, v0
+; GFX10-NEXT: v_readfirstlane_b32 s1, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mul_i32 s3, s3, s4
-; GFX10-NEXT: s_mul_hi_u32 s5, s4, s3
-; GFX10-NEXT: s_mov_b32 s3, 0
-; GFX10-NEXT: s_add_i32 s4, s4, s5
+; GFX10-NEXT: s_mul_i32 s0, s0, s1
+; GFX10-NEXT: s_mul_hi_u32 s5, s1, s0
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: s_add_i32 s1, s1, s5
; GFX10-NEXT: .LBB3_1: ; %bb3
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX10-NEXT: s_mul_i32 s5, s5, s2
-; GFX10-NEXT: s_sub_i32 s5, s3, s5
-; GFX10-NEXT: s_sub_i32 s6, s5, s2
-; GFX10-NEXT: s_cmp_ge_u32 s5, s2
+; GFX10-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX10-NEXT: s_mul_i32 s5, s5, s4
+; GFX10-NEXT: s_sub_i32 s5, s0, s5
+; GFX10-NEXT: s_sub_i32 s6, s5, s4
+; GFX10-NEXT: s_cmp_ge_u32 s5, s4
; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_sub_i32 s6, s5, s2
-; GFX10-NEXT: s_cmp_ge_u32 s5, s2
+; GFX10-NEXT: s_sub_i32 s6, s5, s4
+; GFX10-NEXT: s_cmp_ge_u32 s5, s4
; GFX10-NEXT: s_cselect_b32 s5, s6, s5
-; GFX10-NEXT: s_add_i32 s3, s3, 1
+; GFX10-NEXT: s_add_i32 s0, s0, 1
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_add_u32 s0, s0, 4
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX10-NEXT: s_add_u32 s2, s2, 4
+; GFX10-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX10-NEXT: s_cbranch_scc0 .LBB3_1
; GFX10-NEXT: ; %bb.2: ; %bb2
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: srem32_invariant_denom:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_abs_i32 s2, s2
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s3, 0, s2
+; GFX11-NEXT: s_abs_i32 s4, s2
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX11-NEXT: s_sub_i32 s0, 0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_i32 s3, s3, s4
-; GFX11-NEXT: s_mul_hi_u32 s5, s4, s3
-; GFX11-NEXT: s_mov_b32 s3, 0
-; GFX11-NEXT: s_add_i32 s4, s4, s5
+; GFX11-NEXT: s_mul_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_mul_hi_u32 s5, s1, s0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_add_i32 s1, s1, s5
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB3_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mul_hi_u32 s5, s3, s4
-; GFX11-NEXT: s_mul_i32 s5, s5, s2
+; GFX11-NEXT: s_mul_hi_u32 s5, s0, s1
+; GFX11-NEXT: s_mul_i32 s5, s5, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s5, s3, s5
-; GFX11-NEXT: s_sub_i32 s6, s5, s2
-; GFX11-NEXT: s_cmp_ge_u32 s5, s2
+; GFX11-NEXT: s_sub_i32 s5, s0, s5
+; GFX11-NEXT: s_sub_i32 s6, s5, s4
+; GFX11-NEXT: s_cmp_ge_u32 s5, s4
; GFX11-NEXT: s_cselect_b32 s5, s6, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_sub_i32 s6, s5, s2
-; GFX11-NEXT: s_cmp_ge_u32 s5, s2
+; GFX11-NEXT: s_sub_i32 s6, s5, s4
+; GFX11-NEXT: s_cmp_ge_u32 s5, s4
; GFX11-NEXT: s_cselect_b32 s5, s6, s5
-; GFX11-NEXT: s_add_i32 s3, s3, 1
+; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: s_add_u32 s0, s0, 4
-; GFX11-NEXT: s_addc_u32 s1, s1, 0
-; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, 4
+; GFX11-NEXT: s_addc_u32 s3, s3, 0
+; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x400
; GFX11-NEXT: s_cbranch_scc0 .LBB3_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -748,12 +751,12 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: v_mov_b32_e32 v3, 1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
+; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -765,11 +768,12 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v0
; GFX9-NEXT: v_lshlrev_b32_sdwa v5, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v6, vcc
-; GFX9-NEXT: v_mul_lo_u32 v4, v4, s2
+; GFX9-NEXT: v_mul_lo_u32 v4, v4, s4
; GFX9-NEXT: v_sub_u32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_add_u16_e32 v2, 1, v2
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: global_store_short v5, v4, s[0:1]
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v5, v4, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB5_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -807,13 +811,13 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: urem16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX11-NEXT: s_and_b32 s0, s4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB5_1: ; %bb3
@@ -833,10 +837,10 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v5, vcc_lo
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 1, v3
-; GFX11-NEXT: v_mul_lo_u32 v4, v4, s2
+; GFX11-NEXT: v_mul_lo_u32 v4, v4, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, v3, v4
-; GFX11-NEXT: global_store_b16 v5, v3, s[0:1]
+; GFX11-NEXT: global_store_b16 v5, v3, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB5_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -863,18 +867,18 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: sdiv16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB6_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
+; GFX9-NEXT: s_sext_i32_i16 s5, s1
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT: s_xor_b32 s6, s5, s2
+; GFX9-NEXT: s_xor_b32 s6, s5, s4
; GFX9-NEXT: s_ashr_i32 s5, s6, 30
; GFX9-NEXT: s_or_b32 s5, s5, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
@@ -883,15 +887,16 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
+; GFX9-NEXT: v_add_u16_e64 v2, s1, 1
; GFX9-NEXT: s_cselect_b32 s5, s5, 0
-; GFX9-NEXT: s_and_b32 s6, 0xffff, s4
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9-NEXT: s_and_b32 s6, 0xffff, s1
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: v_add_u32_e32 v2, s5, v4
; GFX9-NEXT: s_lshl_b32 s5, s6, 1
; GFX9-NEXT: v_mov_b32_e32 v3, s5
-; GFX9-NEXT: global_store_short v3, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v2, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB6_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -935,21 +940,21 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: sdiv16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s2, s2
+; GFX11-NEXT: s_sext_i32_i16 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB6_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_sext_i32_i16 s4, s3
-; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: v_add_nc_u16 v2, s1, 1
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT: s_xor_b32 s5, s4, s2
+; GFX11-NEXT: s_xor_b32 s5, s4, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_ashr_i32 s4, s5, 30
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
@@ -964,12 +969,12 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: v_cmp_ge_f32_e64 s5, |v3|, |v0|
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: v_readfirstlane_b32 s3, v2
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v3, s5 :: v_dual_add_nc_u32 v2, s4, v4
-; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v2, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB6_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
@@ -996,18 +1001,18 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-LABEL: srem16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_mov_b32 s4, 0
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX9-NEXT: s_sext_i32_i16 s4, s2
+; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s1, 0
+; GFX9-NEXT: s_movk_i32 s0, 0x400
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: .LBB7_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT: s_sext_i32_i16 s5, s4
+; GFX9-NEXT: s_sext_i32_i16 s5, s1
; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s5
-; GFX9-NEXT: s_xor_b32 s6, s5, s2
+; GFX9-NEXT: s_xor_b32 s6, s5, s4
; GFX9-NEXT: s_ashr_i32 s6, s6, 30
; GFX9-NEXT: s_or_b32 s8, s6, 1
; GFX9-NEXT: v_mul_f32_e32 v4, v3, v1
@@ -1016,17 +1021,19 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX9-NEXT: v_cvt_i32_f32_e32 v4, v4
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v3|, |v0|
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
-; GFX9-NEXT: v_add_u16_e64 v2, s4, 1
+; GFX9-NEXT: v_add_u16_e64 v2, s1, 1
; GFX9-NEXT: s_cselect_b32 s6, s8, 0
-; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v2
-; GFX9-NEXT: s_and_b32 s7, 0xffff, s4
-; GFX9-NEXT: v_readfirstlane_b32 s4, v2
+; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s0, v2
+; GFX9-NEXT: s_and_b32 s7, 0xffff, s1
+; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: v_add_u32_e32 v2, s6, v4
-; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
+; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX9-NEXT: s_lshl_b32 s6, s7, 1
+; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s6
; GFX9-NEXT: v_sub_u32_e32 v2, s5, v2
-; GFX9-NEXT: global_store_short v3, v2, s[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_short v3, v2, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB7_1
; GFX9-NEXT: ; %bb.2: ; %bb2
; GFX9-NEXT: s_endpgm
@@ -1073,21 +1080,21 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-LABEL: srem16_invariant_denom:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sext_i32_i16 s2, s2
+; GFX11-NEXT: s_sext_i32_i16 s0, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB7_1: ; %bb3
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_sext_i32_i16 s4, s3
-; GFX11-NEXT: v_add_nc_u16 v2, s3, 1
+; GFX11-NEXT: s_sext_i32_i16 s4, s1
+; GFX11-NEXT: v_add_nc_u16 v2, s1, 1
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, s4
-; GFX11-NEXT: s_xor_b32 s5, s4, s2
+; GFX11-NEXT: s_xor_b32 s5, s4, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_ashr_i32 s5, s5, 30
; GFX11-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x400, v2
@@ -1105,14 +1112,14 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add_nc_u32_e32 v3, s5, v3
-; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
-; GFX11-NEXT: v_readfirstlane_b32 s3, v2
+; GFX11-NEXT: s_and_b32 s5, 0xffff, s1
+; GFX11-NEXT: v_readfirstlane_b32 s1, v2
; GFX11-NEXT: s_lshl_b32 s5, s5, 1
; GFX11-NEXT: v_mov_b32_e32 v2, s5
-; GFX11-NEXT: v_mul_lo_u32 v3, v3, s2
+; GFX11-NEXT: v_mul_lo_u32 v3, v3, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sub_nc_u32_e32 v3, s4, v3
-; GFX11-NEXT: global_store_b16 v2, v3, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v3, s[2:3]
; GFX11-NEXT: s_cbranch_vccz .LBB7_1
; GFX11-NEXT: ; %bb.2: ; %bb2
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 9da07ea04ded5..06a58163a9080 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -52,7 +52,7 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -60,10 +60,10 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -102,18 +102,19 @@ define amdgpu_kernel void @udot2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -179,7 +180,7 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MulMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -190,7 +191,7 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -198,9 +199,9 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -242,21 +243,22 @@ define amdgpu_kernel void @udot2_MulMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MulMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -317,7 +319,7 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -328,7 +330,7 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -336,10 +338,10 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -378,18 +380,19 @@ define amdgpu_kernel void @idot2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_i32_i16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -451,7 +454,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MixedTypedMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -462,7 +465,7 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -470,10 +473,10 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -515,21 +518,22 @@ define amdgpu_kernel void @idot2_MixedTypedMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MixedTypedMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -591,7 +595,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_alt_AddOperands:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -602,7 +606,7 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -610,10 +614,10 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -656,18 +660,19 @@ define amdgpu_kernel void @udot2_alt_AddOperands(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_alt_AddOperands:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -729,7 +734,7 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MixedExt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -740,7 +745,7 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -748,10 +753,10 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -793,21 +798,22 @@ define amdgpu_kernel void @idot2_MixedExt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MixedExt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v0, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -867,7 +873,7 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_SameVec:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -878,16 +884,16 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -931,22 +937,23 @@ define amdgpu_kernel void @notudot2_SameVec(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_SameVec:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1009,7 +1016,7 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_v4i16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1020,7 +1027,7 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: flat_load_dword v1, v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1028,10 +1035,10 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v1, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1070,18 +1077,19 @@ define amdgpu_kernel void @udot2_v4i16(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_v4i16:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1143,7 +1151,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_v4i16_Hi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1158,7 +1166,7 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1166,10 +1174,10 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1208,18 +1216,19 @@ define amdgpu_kernel void @udot2_v4i16_Hi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_v4i16_Hi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7] offset:4
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot2_u32_u16 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1282,7 +1291,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_v4i16_Even:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1293,7 +1302,7 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1301,10 +1310,10 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1346,21 +1355,22 @@ define amdgpu_kernel void @notudot2_v4i16_Even(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Even:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1423,7 +1433,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_v4i16_Middle:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1434,7 +1444,7 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -1442,10 +1452,10 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1487,21 +1497,22 @@ define amdgpu_kernel void @notudot2_v4i16_Middle(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_v4i16_Middle:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
; GFX10-DL-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1563,7 +1574,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-LABEL: notudot2_DiffIndex:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1574,7 +1585,7 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -1582,10 +1593,10 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1627,21 +1638,22 @@ define amdgpu_kernel void @notudot2_DiffIndex(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notudot2_DiffIndex:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_add3_u32 v0, v1, s2, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, v1, s0, v0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1704,7 +1716,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1715,7 +1727,7 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -1723,11 +1735,11 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1773,14 +1785,15 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1788,9 +1801,9 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1855,7 +1868,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1866,7 +1879,7 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -1874,11 +1887,11 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1924,14 +1937,15 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1939,9 +1953,9 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2006,7 +2020,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2017,7 +2031,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -2025,11 +2039,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v2, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2079,14 +2093,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2094,10 +2109,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2163,7 +2178,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2174,7 +2189,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -2182,11 +2197,11 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s2
+; GFX8-NEXT: v_mad_i32_i24 v4, v2, v1, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2236,14 +2251,15 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 16
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2251,10 +2267,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v1, v0, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2320,7 +2336,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2331,7 +2347,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -2339,11 +2355,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2391,14 +2407,15 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2406,10 +2423,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2475,7 +2492,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-LABEL: idot2_MultipleUses_mul2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2486,7 +2503,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 16, v3
@@ -2494,11 +2511,11 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2546,14 +2563,15 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot2_MultipleUses_mul2:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2561,10 +2579,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2629,7 +2647,7 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2640,8 +2658,8 @@ define amdgpu_kernel void @udot2_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -2770,7 +2788,7 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-LABEL: notsdot2_sext8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2781,7 +2799,7 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 8, v3
@@ -2791,10 +2809,10 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2840,23 +2858,24 @@ define amdgpu_kernel void @notsdot2_sext8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: notsdot2_sext8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_ushort v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_ushort v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0001
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index fdd913867c8f8..c148ba3a60b1c 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -56,7 +56,7 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -66,14 +66,14 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -115,36 +115,38 @@ define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -236,7 +238,7 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -247,8 +249,8 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v7, v3, 0, 8
@@ -344,16 +346,16 @@ define amdgpu_kernel void @idot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_i16 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_i16 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v2, v0, v3 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -437,7 +439,7 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -448,8 +450,8 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -529,16 +531,16 @@ define amdgpu_kernel void @idot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -615,7 +617,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_multiuse_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -626,7 +628,7 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -636,15 +638,15 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v8, v1, v2, s0
; GFX8-NEXT: v_mad_i32_i24 v4, v4, v5, v8
; GFX8-NEXT: v_bfe_i32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v4
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -695,45 +697,47 @@ define amdgpu_kernel void @idot4_multiuse_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s2
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_multiuse_mul1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_bfe_i32 v3, v0, 0, 8
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s2
+; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, v3, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -819,7 +823,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -830,7 +834,7 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 24, v3
@@ -844,12 +848,12 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v1, v2, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v4, v6, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -894,36 +898,38 @@ define amdgpu_kernel void @idot4_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1001,7 +1007,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1012,8 +1018,8 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -1111,15 +1117,16 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_ashrrev_i16 v4, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -1145,20 +1152,21 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc16_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_ashrrev_i16 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -1190,7 +1198,7 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1254,7 +1262,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_2ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1265,7 +1273,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v3, v3, 8, 8
@@ -1273,10 +1281,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1320,44 +1328,46 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_2ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_2ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1425,7 +1435,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1436,7 +1446,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -1445,12 +1455,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1498,44 +1508,46 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1610,7 +1622,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele_permuted:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1621,7 +1633,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
@@ -1630,12 +1642,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; GFX8-NEXT: v_bfe_i32 v5, v0, 0, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1683,44 +1695,46 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1795,7 +1809,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_opt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1806,8 +1820,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v7, v3, 16, 8
@@ -1860,9 +1874,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_opt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
@@ -1871,14 +1886,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_opt:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1886,7 +1902,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, 0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1970,22 +1986,22 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
@@ -2001,20 +2017,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v5, v6, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -2026,20 +2042,20 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -2048,20 +2064,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v3, v1, s1
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2071,19 +2086,19 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v3, v0
-; GFX10-DL-NEXT: global_store_dword v2, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2093,7 +2108,7 @@ define amdgpu_kernel void @idot4_acc32_3src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2177,22 +2192,22 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
@@ -2205,20 +2220,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src_3ele:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8
@@ -2229,21 +2244,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src_3ele:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -2253,20 +2268,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src_3ele:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2277,19 +2291,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src_3ele:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2300,7 +2314,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2378,9 +2392,9 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_bad_source:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2390,122 +2404,122 @@ define amdgpu_kernel void @idot4_bad_source(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX8-NEXT: s_sext_i32_i16 s2, s2
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT: s_sext_i32_i16 s1, s8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v2, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
-; GFX8-NEXT: v_mad_i32_i24 v1, v2, s2, v1
+; GFX8-NEXT: v_mad_i32_i24 v1, v2, s1, v1
; GFX8-NEXT: v_bfe_i32 v3, v3, 16, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_bad_source:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT: s_sext_i32_i16 s2, s2
+; GFX9-NODL-NEXT: s_sext_i32_i16 s1, s8
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s2, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, s1, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_bad_source:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0201
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201
-; GFX9-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT: s_sext_i32_i16 s4, s8
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4
-; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s2, v3
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_mad_i32_i24 v3, v4, s4, v3
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v2, v3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_bad_source:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT: s_sext_i32_i16 s1, s8
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s2, s3
+; GFX10-DL-NEXT: v_mad_i32_i24 v0, v0, s1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v0, v1, v2
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_bad_source:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT: s_load_b32 s8, s[0:1], 0x3c
; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_sext_i32_i16 s2, s2
-; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT: s_sext_i32_i16 s1, s8
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s2, s3
+; GFX11-DL-NEXT: v_mad_i32_i24 v2, v2, s1, s0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v1, v0, v2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2586,7 +2600,7 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_commutative:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2597,7 +2611,7 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v3, 8, 8
@@ -2606,12 +2620,12 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2659,44 +2673,46 @@ define amdgpu_kernel void @idot4_commutative(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_commutative:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_commutative:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2776,22 +2792,22 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
@@ -2803,20 +2819,20 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 8, 8
@@ -2827,21 +2843,21 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0
@@ -2851,20 +2867,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2875,19 +2890,19 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mov_b32_e32 v1, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v1, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v3, v1, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3src_3ele_src0:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -2898,7 +2913,7 @@ define amdgpu_kernel void @idot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v1, s0 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2986,7 +3001,7 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -3004,12 +3019,12 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_i32 v3, v4, 0, 8
; GFX8-NEXT: v_bfe_i32 v4, v4, 8, 8
@@ -3022,8 +3037,8 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_i32_i24 v2, v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3031,14 +3046,14 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v1, sext(v1), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
@@ -3048,52 +3063,52 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v4, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: idot4_4src:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501
-; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0501
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x5010c0c
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400
-; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s0
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s1
; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5
; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5
; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_i32_i8 v1, v1, v3, s6
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: idot4_4src:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x3
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -3104,23 +3119,23 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_or_b32_e32 v0, v5, v0
; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-DL-NEXT: v_mov_b32_e32 v2, s0
; GFX10-DL-NEXT: v_dot4c_i32_i8 v2, v1, v0
-; GFX10-DL-NEXT: global_store_dword v3, v2, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v2, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_4src:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x3
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -3133,8 +3148,8 @@ define amdgpu_kernel void @idot4_4src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s2 neg_lo:[1,1,0]
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_i32_iu8 v0, v0, v2, s0 neg_lo:[1,1,0]
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3231,7 +3246,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_nonstandard_signed:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v4, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -3243,8 +3258,8 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -3327,10 +3342,11 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_nonstandard_signed:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@@ -3355,14 +3371,15 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_nonstandard_signed:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
@@ -3390,7 +3407,7 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 0b131ea74f1ab..86aab8cb54278 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -56,7 +56,7 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -66,14 +66,14 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -115,34 +115,36 @@ define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -227,7 +229,7 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc16:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -239,8 +241,8 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v3
@@ -329,16 +331,16 @@ define amdgpu_kernel void @udot4_acc16(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u16 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -423,7 +425,7 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -434,8 +436,8 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -515,16 +517,16 @@ define amdgpu_kernel void @udot4_acc8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v2, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -595,7 +597,7 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX8-LABEL: udot2_8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -606,8 +608,8 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v3
@@ -684,14 +686,14 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
@@ -699,7 +701,7 @@ define amdgpu_kernel void @udot2_8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -765,7 +767,7 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_CommutationInsideMAD:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -776,8 +778,8 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -857,16 +859,16 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -943,7 +945,7 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_CommutationAccrossMADs:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -954,8 +956,8 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3
@@ -1035,16 +1037,16 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v1, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v1, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v3
-; GFX11-DL-NEXT: global_store_b8 v1, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1122,7 +1124,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_multiuse_mul1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1133,7 +1135,7 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -1143,15 +1145,15 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v8, v1, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v8
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1202,45 +1204,47 @@ define amdgpu_kernel void @udot4_multiuse_mul1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_mul1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_multiuse_mul1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xff, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s2
+; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, v3, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1328,7 +1332,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_multiuse_add1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -1339,7 +1343,7 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
@@ -1349,16 +1353,16 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s2
+; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, v4
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v6, v7, v1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s0, v4
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v5
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1408,46 +1412,48 @@ define amdgpu_kernel void @udot4_multiuse_add1(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_multiuse_add1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v0, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: s_add_i32 s2, s2, s2
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: s_add_i32 s0, s0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-DL-NEXT: v_add3_u32 v0, s2, v0, v1
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_multiuse_add1:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_bfe_u32 v2, v1, 8, 8
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_bfe_u32 v3, v0, 8, 8
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-DL-NEXT: s_add_i32 s2, s2, s2
+; GFX11-DL-NEXT: s_add_i32 s0, s0, s0
; GFX11-DL-NEXT: v_mul_u32_u24_e32 v2, v2, v3
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_add3_u32 v0, s2, v2, v0
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_add3_u32 v0, s0, v2, v0
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1535,7 +1541,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX8-LABEL: notdot4_mixedtypes:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1547,8 +1553,8 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3
@@ -1663,7 +1669,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1678,7 +1684,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_bfe_i32 v7, v0, 0, 8
; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX11-DL-NEXT: v_and_b32_e32 v5, 0xff, v5
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0302
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0302
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1688,7 +1694,7 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -1778,7 +1784,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX8-LABEL: notdot4_mixedtypes2:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1790,8 +1796,8 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3
@@ -1920,7 +1926,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -1931,7 +1937,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_and_b32_e32 v9, 0xff, v0
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v0
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; GFX11-DL-NEXT: v_and_b32_e32 v4, 0xff, v4
@@ -1950,7 +1956,7 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_mad_u16 v3, v4, v5, v3
; GFX11-DL-NEXT: v_mad_u16 v0, v1, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2036,7 +2042,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2047,7 +2053,7 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 8
@@ -2059,12 +2065,12 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b16_e32 v7, 8, v0
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v7, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2106,34 +2112,36 @@ define amdgpu_kernel void @udot4_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2208,7 +2216,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2220,8 +2228,8 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -2315,16 +2323,17 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v8, 0xff
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b16 v4, 8, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2348,21 +2357,22 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc16_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_and_b32 v7, 0xff, v1
-; GFX11-DL-NEXT: global_load_u16 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u16 v3, v2, s[2:3]
; GFX11-DL-NEXT: v_lshrrev_b16 v4, 8, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_lshrrev_b16 v5, 8, v0
@@ -2391,7 +2401,7 @@ define amdgpu_kernel void @udot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_add_nc_u16 v0, v1, v0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX11-DL-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2462,7 +2472,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2473,8 +2483,8 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
@@ -2554,15 +2564,16 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc8_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2585,20 +2596,21 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v5
; GFX10-DL-NEXT: v_mad_u16 v1, v7, v8, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v2
-; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc8_vecMul:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: global_load_u8 v3, v2, s[0:1]
+; GFX11-DL-NEXT: global_load_u8 v3, v2, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX11-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -2630,7 +2642,7 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-NEXT: v_mad_u16 v0, v4, v7, v0
; GFX11-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX11-DL-NEXT: global_store_b8 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b8 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2691,7 +2703,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_2ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2702,7 +2714,7 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v3, v3, 8, 8
@@ -2710,10 +2722,10 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2757,43 +2769,45 @@ define amdgpu_kernel void @idot4_acc32_2ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_2ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_2ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -2860,7 +2874,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -2871,7 +2885,7 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -2880,12 +2894,12 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2933,43 +2947,45 @@ define amdgpu_kernel void @idot4_acc32_3ele(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3043,7 +3059,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_3ele_permuted:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3054,7 +3070,7 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v3
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
@@ -3063,12 +3079,12 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3116,43 +3132,45 @@ define amdgpu_kernel void @idot4_acc32_3ele_permuted(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020003
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020003
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_3ele_permuted:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020003
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020003
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3228,7 +3246,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_opt:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3239,8 +3257,8 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 8
@@ -3293,9 +3311,10 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_opt:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
@@ -3303,14 +3322,15 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_opt:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -3318,7 +3338,7 @@ define amdgpu_kernel void @idot4_acc32_opt(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3402,22 +3422,22 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -3433,20 +3453,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 24, v0
; GFX8-NEXT: v_mad_u32_u24 v1, v5, v6, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
@@ -3458,20 +3478,20 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_add3_u32 v2, v4, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v5, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0x706010c
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0c00
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s1, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -3480,20 +3500,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_or_b32_e32 v1, v1, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v3, v1, s1
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0x706010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3502,19 +3521,19 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v3, v0, s0
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0x706010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3524,7 +3543,7 @@ define amdgpu_kernel void @udot4_acc32_3src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3609,22 +3628,22 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src_3ele:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
@@ -3637,20 +3656,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src_3ele:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[0:1]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xff, v1
@@ -3661,21 +3680,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src_3ele:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c00
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020100
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s0
@@ -3685,20 +3704,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src_3ele:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3708,19 +3726,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020100
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src_3ele:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[0:1]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -3731,7 +3749,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -3811,9 +3829,9 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_bad_source:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX8-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -3823,122 +3841,122 @@ define amdgpu_kernel void @udot4_bad_source(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0
-; GFX8-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX8-NEXT: s_and_b32 s1, s8, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, s2, v1
+; GFX8-NEXT: v_mad_u32_u24 v1, v2, s1, v1
; GFX8-NEXT: v_bfe_u32 v3, v3, 16, 8
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_bad_source:
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-NODL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-NODL-NEXT: s_and_b32 s2, s2, 0xffff
+; GFX9-NODL-NEXT: s_and_b32 s1, s8, 0xffff
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xff, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s2, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, v3, s1, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v2, v4, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_bad_source:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX9-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0201
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0201
-; GFX9-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX9-DL-NEXT: s_and_b32 s4, s8, 0xffff
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xff, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s4
-; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s2, v3
-; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s4
+; GFX9-DL-NEXT: v_perm_b32 v2, v2, v2, s1
+; GFX9-DL-NEXT: v_mad_u32_u24 v3, v4, s4, v3
+; GFX9-DL-NEXT: v_perm_b32 v1, v1, v1, s1
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, v3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_bad_source:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x3c
+; GFX10-DL-NEXT: s_load_dword s8, s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX10-DL-NEXT: s_load_dword s3, s[0:1], 0x0
+; GFX10-DL-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v0, 0xff, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v2, v2, v2, 0xc0c0201
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s2, s3
+; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, s1, s0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v2, v0
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_bad_source:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x3c
+; GFX11-DL-NEXT: s_load_b32 s8, s[0:1], 0x3c
; GFX11-DL-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_and_b32 s2, s2, 0xffff
-; GFX11-DL-NEXT: s_load_b32 s3, s[0:1], 0x0
+; GFX11-DL-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc0c0201
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0201
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s2, s3
+; GFX11-DL-NEXT: v_mad_u32_u24 v2, v2, s1, s0
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, v2
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4019,7 +4037,7 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_commutative:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4030,7 +4048,7 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 8, 8
@@ -4039,12 +4057,12 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v5, v0, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v1, v4, v5, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4092,43 +4110,45 @@ define amdgpu_kernel void @udot4_commutative(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_commutative:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v1, 0xc020100
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0xc020100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_commutative:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x3c
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x3c
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc020100
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0xc020100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4208,22 +4228,22 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot4_acc32_3src_3ele_src0:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; GFX8-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s8, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt vmcnt(1)
@@ -4235,20 +4255,20 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-NODL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX9-NODL: ; %bb.0: ; %entry
-; GFX9-NODL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-NODL-NEXT: global_load_dword v2, v0, s[4:5]
-; GFX9-NODL-NEXT: global_load_dword v3, v0, s[0:1]
-; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NODL-NEXT: global_load_dword v1, v0, s[6:7]
+; GFX9-NODL-NEXT: global_load_dword v2, v0, s[8:9]
+; GFX9-NODL-NEXT: global_load_dword v3, v0, s[4:5]
+; GFX9-NODL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
; GFX9-NODL-NEXT: v_bfe_u32 v4, v1, 8, 8
@@ -4259,21 +4279,21 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v3, v4, v4, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v3, v2, v1
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX9-DL: ; %bb.0: ; %entry
-; GFX9-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX9-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xc06010c
; GFX9-DL-NEXT: s_mov_b32 s1, 0xc0c0c01
; GFX9-DL-NEXT: s_mov_b32 s2, 0xc020101
+; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX9-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX9-DL-NEXT: s_load_dword s3, s[10:11], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_perm_b32 v1, v1, v2, s0
@@ -4283,20 +4303,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_perm_b32 v2, v3, v3, s2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v2, v1, s3
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[10:11]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX10-DL: ; %bb.0: ; %entry
-; GFX10-DL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x2
-; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1]
-; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3]
-; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-DL-NEXT: global_load_dword v2, v0, s[4:5]
+; GFX10-DL-NEXT: global_load_dword v3, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[10:11], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v2, 0xc06010c
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4306,19 +4325,19 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0xc020101
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[10:11]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_3src_3ele_src0:
; GFX11-DL: ; %bb.0: ; %entry
-; GFX11-DL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x2
-; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-DL-NEXT: global_load_b32 v2, v0, s[0:1]
-; GFX11-DL-NEXT: global_load_b32 v0, v0, s[2:3]
-; GFX11-DL-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-DL-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-DL-NEXT: global_load_b32 v2, v0, s[4:5]
+; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
+; GFX11-DL-NEXT: s_load_b32 s0, s[10:11], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v2, 0xc06010c
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4329,7 +4348,7 @@ define amdgpu_kernel void @udot4_acc32_3src_3ele_src0(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, s0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[6:7]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[10:11]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4417,7 +4436,7 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -4435,12 +4454,12 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s10, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(3)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v2, s0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v4, v4, 8, 8
@@ -4453,8 +4472,8 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v0
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v7, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4462,14 +4481,14 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NODL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-NODL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-NODL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX9-NODL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(3)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(2)
@@ -4479,52 +4498,52 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v4, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_1
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s2, v2
+; GFX9-NODL-NEXT: v_add3_u32 v1, v1, s0, v2
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v4
-; GFX9-NODL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NODL-NEXT: s_endpgm
;
; GFX9-DL-LABEL: udot4_4src:
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xc0c0501
-; GFX9-DL-NEXT: s_mov_b32 s3, 0x5010c0c
+; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
+; GFX9-DL-NEXT: s_mov_b32 s0, 0xc0c0501
+; GFX9-DL-NEXT: s_mov_b32 s1, 0x5010c0c
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX9-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX9-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX9-DL-NEXT: global_load_dword v4, v0, s[10:11]
; GFX9-DL-NEXT: s_mov_b32 s4, 0xc0c0400
-; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0
+; GFX9-DL-NEXT: s_load_dword s6, s[2:3], 0x0
; GFX9-DL-NEXT: s_mov_b32 s5, 0x4000c0c
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s2
+; GFX9-DL-NEXT: v_perm_b32 v5, v2, v1, s0
; GFX9-DL-NEXT: v_perm_b32 v1, v2, v1, s4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s3
+; GFX9-DL-NEXT: v_perm_b32 v6, v4, v3, s1
; GFX9-DL-NEXT: v_perm_b32 v2, v4, v3, s5
; GFX9-DL-NEXT: v_or_b32_e32 v3, v6, v5
; GFX9-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_dot4_u32_u8 v1, v1, v3, s6
-; GFX9-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-DL-NEXT: s_endpgm
;
; GFX10-DL-LABEL: udot4_4src:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x3
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[8:9]
; GFX10-DL-NEXT: global_load_dword v4, v0, s[10:11]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0501
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -4535,22 +4554,22 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_or_b32_e32 v1, v2, v1
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_4src:
; GFX11-DL: ; %bb.0: ; %entry
; GFX11-DL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x44
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x3
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v0, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v3, v0, s[8:9]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[10:11]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(2)
; GFX11-DL-NEXT: v_perm_b32 v4, v2, v1, 0xc0c0501
; GFX11-DL-NEXT: v_perm_b32 v1, v2, v1, 0xc0c0400
@@ -4563,8 +4582,8 @@ define amdgpu_kernel void @udot4_4src(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s2
-; GFX11-DL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, s0
+; GFX11-DL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4667,7 +4686,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-LABEL: udot4_acc32_multi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4678,7 +4697,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -4686,7 +4705,7 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 8
; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s2
+; GFX8-NEXT: v_mad_u32_u24 v3, v3, v4, s0
; GFX8-NEXT: v_and_b32_e32 v9, 0xff, v1
; GFX8-NEXT: v_mad_u32_u24 v3, v7, v8, v3
; GFX8-NEXT: v_bfe_u32 v11, v1, 16, 8
@@ -4702,8 +4721,8 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v10, v6, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -4762,37 +4781,39 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: udot4_acc32_multi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX10-DL-NEXT: global_load_dword v3, v2, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_perm_b32 v2, v1, v0, 0x6040200
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v4, v3, v3, 0x2000200
; GFX10-DL-NEXT: v_perm_b32 v0, v1, v0, 0x7050301
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s2
+; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v2, v4, s0
; GFX10-DL-NEXT: v_perm_b32 v2, v3, v3, 0x3010301
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1
-; GFX10-DL-NEXT: global_store_dword v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v3, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: udot4_acc32_multi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b64 v[0:1], v2, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v2, v2, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(1)
; GFX11-DL-NEXT: v_perm_b32 v3, v1, v0, 0x6040200
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
@@ -4801,10 +4822,10 @@ define amdgpu_kernel void @udot4_acc32_multi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v2, v2, v2, 0x3010301
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s2
+; GFX11-DL-NEXT: v_dot4_u32_u8 v1, v3, v4, s0
; GFX11-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v2, v1
-; GFX11-DL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v3, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -4915,7 +4936,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_hilo:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -4928,8 +4949,8 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -4982,9 +5003,10 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_hilo:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5] offset:4
@@ -4992,14 +5014,15 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v1, v1, v2, 0
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_hilo:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5007,7 +5030,7 @@ define amdgpu_kernel void @idot4_acc32_hilo(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5088,7 +5111,7 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_lohi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -5101,8 +5124,8 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -5160,9 +5183,10 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_lohi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
@@ -5173,14 +5197,15 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3020001
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_lohi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5192,7 +5217,7 @@ define amdgpu_kernel void @idot4_acc32_lohi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5273,7 +5298,7 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_hihi:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -5288,8 +5313,8 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v4, 0xff, v2
; GFX8-NEXT: v_bfe_u32 v7, v2, 8, 8
@@ -5347,9 +5372,10 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_hihi:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[6:7] offset:4
@@ -5360,14 +5386,15 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v2, v2, 0x3010002
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_hihi:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
@@ -5379,7 +5406,7 @@ define amdgpu_kernel void @idot4_acc32_hihi(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3010002
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5456,7 +5483,7 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v8i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -5474,8 +5501,8 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 24, v1
; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
; GFX8-NEXT: v_mad_u32_u24 v2, v0, v1, v2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -5513,28 +5540,30 @@ define amdgpu_kernel void @idot4_acc32_v8i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v8i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v8i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5617,7 +5646,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v16i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -5630,8 +5659,8 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[1:2]
; GFX8-NEXT: flat_load_dword v4, v[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_and_b32_e32 v5, 0xff, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v6, 0xff, v4
@@ -5696,10 +5725,11 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v16i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v5, 3, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: ; kill: killed $vgpr5
; GFX10-DL-NEXT: ; kill: killed $vgpr4
; GFX10-DL-NEXT: ; kill: killed $sgpr4_sgpr5_sgpr6 killed $sgpr7
@@ -5712,15 +5742,16 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v16i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b128 v[0:3], v1, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v4, s[6:7]
@@ -5731,7 +5762,7 @@ define amdgpu_kernel void @idot4_acc32_v16i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x3020001
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5814,10 +5845,10 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_v256i8:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT: s_movk_i32 s2, 0xfc
+; GFX8-NEXT: s_movk_i32 s0, 0xfc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v1
@@ -5826,11 +5857,11 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v4
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 8
@@ -5890,10 +5921,11 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_v256i8:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: global_load_dword v2, v1, s[6:7]
; GFX10-DL-NEXT: global_load_dword v3, v0, s[4:5] offset:252
@@ -5903,15 +5935,16 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_perm_b32 v1, v3, v3, 0x1000302
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, 0
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_v256i8:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v1, 3, v0
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 8, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: global_load_b32 v1, v1, s[6:7]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[4:5] offset:252
@@ -5921,7 +5954,7 @@ define amdgpu_kernel void @idot4_acc32_v256i8(ptr addrspace(1) %src1,
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v0, 0x1000302
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v0, v1, 0
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
@@ -5997,7 +6030,7 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-LABEL: idot4_acc32_anyext:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -6008,17 +6041,17 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 8, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v1, v1, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v0, v0, 8, 8
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -6063,41 +6096,43 @@ define amdgpu_kernel void @idot4_acc32_anyext(ptr addrspace(1) %src1,
;
; GFX10-DL-LABEL: idot4_acc32_anyext:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_perm_b32 v0, v2, v1, 0xc0c0500
; GFX10-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX10-DL-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
;
; GFX11-DL-LABEL: idot4_acc32_anyext:
; GFX11-DL: ; %bb.0: ; %entry
+; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-DL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-DL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_clause 0x1
; GFX11-DL-NEXT: global_load_b32 v1, v0, s[4:5]
; GFX11-DL-NEXT: global_load_b32 v0, v0, s[6:7]
-; GFX11-DL-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-DL-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-DL-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-NEXT: v_perm_b32 v0, v0, v1, 0xc0c0500
; GFX11-DL-NEXT: v_perm_b32 v1, v1, v1, 0xc0c0100
; GFX11-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s2
-; GFX11-DL-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-DL-NEXT: v_dot4_u32_u8 v0, v1, v0, s0
+; GFX11-DL-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-DL-NEXT: s_nop 0
; GFX11-DL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 8c53d2671de3f..036965df60dd2 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -63,11 +63,15 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -77,11 +81,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -93,7 +93,7 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
; GFX8-NEXT: v_mad_i32_i24 v1, v4, v5, v1
; GFX8-NEXT: v_bfe_i32 v9, v0, 12, 4
@@ -109,8 +109,8 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 28, v0
; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -189,44 +189,44 @@ define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -372,11 +372,16 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc16:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -386,14 +391,9 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -599,20 +599,20 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-LABEL: idot8_acc16:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -670,27 +670,26 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
-; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -748,7 +747,7 @@ define amdgpu_kernel void @idot8_acc16(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -893,11 +892,16 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc8:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -907,14 +911,9 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v10, 4, v3
; GFX8-NEXT: v_lshlrev_b16_e32 v16, 12, v3
@@ -1120,20 +1119,20 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-LABEL: idot8_acc8:
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -1191,27 +1190,26 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v2, v3, v1
; GFX10-DL-XNACK-NEXT: v_mad_u16 v1, v4, v5, v1
-; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
-; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v4, 28, v1
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 24, v1
@@ -1269,7 +1267,7 @@ define amdgpu_kernel void @idot8_acc8(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_ashrrev_i16 v5, 12, v5
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v1, v3, v0
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_byte v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1401,11 +1399,15 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1415,11 +1417,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_bfe_i32 v1, v3, 0, 4
; GFX8-NEXT: v_bfe_i32 v4, v3, 4, 4
@@ -1430,7 +1428,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s2
+; GFX8-NEXT: v_mad_i32_i24 v16, v1, v2, s0
; GFX8-NEXT: v_bfe_i32 v5, v0, 4, 4
; GFX8-NEXT: v_mad_i32_i24 v1, v1, v2, v16
; GFX8-NEXT: v_bfe_i32 v7, v0, 8, 4
@@ -1449,8 +1447,8 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_i32_i24 v1, v14, v15, v1
; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v16, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1560,18 +1558,18 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v0, v1, 0, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
@@ -1585,7 +1583,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v9, v2, 12, 4
; GFX10-DL-XNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s2
+; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v5, v0, v7, s0
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v10, v2, 16, 4
; GFX10-DL-XNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
@@ -1605,25 +1603,25 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v3, v4
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-XNACK-NEXT: v_add3_u32 v0, v0, v1, v5
-; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_multiuses_mul1:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v2, v1, 0, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v3, v1, 4, 4
@@ -1637,7 +1635,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v9, v0, 12, 4
; GFX10-DL-NOXNACK-NEXT: v_mul_i32_i24_e32 v4, v5, v6
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s2
+; GFX10-DL-NOXNACK-NEXT: v_mad_i32_i24 v5, v2, v7, s0
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v6, v1, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v10, v0, 16, 4
; GFX10-DL-NOXNACK-NEXT: v_bfe_i32 v11, v1, 20, 4
@@ -1657,7 +1655,7 @@ define amdgpu_kernel void @idot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v1, v2, v3, v4
; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: v_add3_u32 v0, v1, v0, v5
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1788,11 +1786,15 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1802,11 +1804,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_i32 v2, v3, 24, 4
@@ -1826,7 +1824,7 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_i32 v15, v0, 4, 4
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_i32_i24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v6, v13, v0
@@ -1834,8 +1832,8 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_i32_i24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_i32_i24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1914,44 +1912,44 @@ define amdgpu_kernel void @idot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s2
-; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v1, v1, v2, s0
+; GFX10-DL-XNACK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc32_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NOXNACK-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s2
-; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: v_dot8_i32_i4 v0, v1, v0, s0
+; GFX10-DL-NOXNACK-NEXT: global_store_dword v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2061,11 +2059,16 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2075,14 +2078,9 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3
; GFX8-NEXT: v_lshlrev_b16_sdwa v7, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
@@ -2315,19 +2313,19 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2401,26 +2399,26 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc16_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ushort v3, v2, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v5, 4, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2494,7 +2492,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v1, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v3
-; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_short v2, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2604,11 +2602,16 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: idot8_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: v_mov_b32_e32 v5, 12
-; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2618,14 +2621,9 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 20, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 28, v3
@@ -2890,19 +2888,19 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK: ; %bb.0: ; %entry
; GFX10-DL-XNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-XNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-XNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-XNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-XNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-XNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-XNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-XNACK-NEXT: s_clause 0x1
; GFX10-DL-XNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-XNACK-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_load_ubyte v3, v4, s[2:3]
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-XNACK-NEXT: s_waitcnt vmcnt(1)
@@ -2983,26 +2981,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-XNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-XNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-XNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-XNACK-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-XNACK-NEXT: s_endpgm
;
; GFX10-DL-NOXNACK-LABEL: idot8_acc8_vecMul:
; GFX10-DL-NOXNACK: ; %bb.0: ; %entry
; GFX10-DL-NOXNACK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NOXNACK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NOXNACK-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NOXNACK-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NOXNACK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NOXNACK-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NOXNACK-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NOXNACK-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NOXNACK-NEXT: s_clause 0x1
; GFX10-DL-NOXNACK-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NOXNACK-NEXT: global_load_dword v0, v0, s[6:7]
-; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_load_ubyte v2, v4, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v8, 12, v1
; GFX10-DL-NOXNACK-NEXT: s_waitcnt vmcnt(1)
@@ -3083,7 +3081,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NOXNACK-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX10-DL-NOXNACK-NEXT: v_mad_u16 v0, v7, v14, v0
; GFX10-DL-NOXNACK-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-NOXNACK-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-NOXNACK-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 3828fa557731e..f29908ad38e0d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -61,11 +61,15 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc32:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -75,11 +79,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -99,7 +99,7 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -107,8 +107,8 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -187,22 +187,22 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -332,11 +332,15 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -346,13 +350,9 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -650,11 +650,15 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc8:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -664,13 +668,9 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -969,11 +969,15 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc4:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -983,13 +987,9 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -1276,11 +1276,15 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_CommutationInsideMAD:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1290,13 +1294,9 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -1582,11 +1582,15 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_multiuses_mul1:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1596,11 +1600,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -1620,7 +1620,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16
; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16
; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3
@@ -1630,8 +1630,8 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3
; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1741,18 +1741,18 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
@@ -1768,7 +1768,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4
; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s2
+; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0
; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4
; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11
@@ -1786,7 +1786,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2
; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0
-; GFX10-DL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -1916,11 +1916,15 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc32_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -1930,11 +1934,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3
; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4
@@ -1954,7 +1954,7 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4
; GFX8-NEXT: v_and_b32_e32 v0, 15, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s2
+; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0
@@ -1962,8 +1962,8 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2042,22 +2042,22 @@ define amdgpu_kernel void @udot8_acc32_vecMul(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v1, v2, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2152,11 +2152,15 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc16_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2166,13 +2170,9 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ushort v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4
@@ -2324,20 +2324,20 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-LABEL: udot8_acc16_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ushort v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ushort v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2381,7 +2381,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
-; GFX10-DL-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_short v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2475,11 +2475,15 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc8_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2489,13 +2493,9 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3
; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4
@@ -2680,19 +2680,19 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
-; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -2743,7 +2743,7 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8
; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0
; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1
-; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v4, v0, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -2838,11 +2838,15 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
;
; GFX8-LABEL: udot8_acc4_vecMul:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX8-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX8-NEXT: s_mov_b32 s10, -1
+; GFX8-NEXT: s_mov_b32 s11, 0xe80000
+; GFX8-NEXT: s_add_u32 s8, s8, s3
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2
@@ -2852,13 +2856,9 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_load_ubyte v4, v[0:1]
-; GFX8-NEXT: s_mov_b32 s10, -1
-; GFX8-NEXT: s_mov_b32 s11, 0xe80000
-; GFX8-NEXT: s_add_u32 s8, s8, s3
-; GFX8-NEXT: s_addc_u32 s9, s9, 0
; GFX8-NEXT: s_waitcnt vmcnt(2)
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3
@@ -3013,20 +3013,20 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-LABEL: udot8_acc4_vecMul:
; GFX10-DL: ; %bb.0: ; %entry
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-DL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-DL-NEXT: s_mov_b32 s10, -1
; GFX10-DL-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-DL-NEXT: s_add_u32 s8, s8, s3
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-DL-NEXT: s_addc_u32 s9, s9, 0
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[0:1]
+; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[2:3]
; GFX10-DL-NEXT: s_waitcnt vmcnt(2)
; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(1)
@@ -3071,7 +3071,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1
; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3
; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1
-; GFX10-DL-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX10-DL-NEXT: global_store_byte v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
@@ -3156,7 +3156,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-LABEL: udot8_variant1:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s5
@@ -3167,7 +3167,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[0:1]
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
+; GFX8-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_and_b32_e32 v1, 15, v3
; GFX8-NEXT: v_bfe_u32 v4, v3, 4, 4
@@ -3187,7 +3187,7 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 28, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s2
+; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v1
; GFX8-NEXT: v_mad_u32_u24 v0, v5, v4, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v7, v6, v0
@@ -3195,8 +3195,8 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
; GFX8-NEXT: v_mad_u32_u24 v0, v11, v10, v0
; GFX8-NEXT: v_mad_u32_u24 v0, v13, v12, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v15, v14, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -3261,18 +3261,19 @@ define amdgpu_kernel void @udot8_variant1(ptr addrspace(1) %v1addr,
;
; GFX10-DL-LABEL: udot8_variant1:
; GFX10-DL: ; %bb.0: ; %entry
+; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: s_clause 0x1
; GFX10-DL-NEXT: global_load_dword v1, v0, s[4:5]
; GFX10-DL-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX10-DL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s2
-; GFX10-DL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-DL-NEXT: v_dot8_u32_u4 v1, v2, v1, s0
+; GFX10-DL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %v2addr,
ptr addrspace(1) %dst) {
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index f7a0e296fa173..66e54aa88a8a4 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -17,13 +17,13 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
;
; VI-LABEL: i64_imm_inline_lo:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 5
; VI-NEXT: v_mov_b32_e32 v1, 0x12345678
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
@@ -45,13 +45,13 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) {
;
; VI-LABEL: i64_imm_inline_hi:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x12345678
; VI-NEXT: v_mov_b32_e32 v1, 5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
store i64 21780256376, ptr addrspace(1) %out ; 0x0000000512345678
@@ -72,13 +72,13 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_imm_neg_0.0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_bfrev_b32_e32 v1, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store i64 -9223372036854775808, ptr addrspace(1) %out
ret void
@@ -97,12 +97,12 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_neg_0.0_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store i32 -2147483648, ptr addrspace(1) %out
ret void
@@ -121,12 +121,12 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0.0, ptr addrspace(1) %out
ret void
@@ -145,12 +145,12 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_imm_neg_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_bfrev_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -0.0, ptr addrspace(1) %out
ret void
@@ -169,12 +169,12 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0.5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0.5, ptr addrspace(1) %out
ret void
@@ -193,12 +193,12 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -0.5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -0.5, ptr addrspace(1) %out
ret void
@@ -217,12 +217,12 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 1.0, ptr addrspace(1) %out
ret void
@@ -241,12 +241,12 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -1.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -1.0, ptr addrspace(1) %out
ret void
@@ -265,12 +265,12 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 2.0, ptr addrspace(1) %out
ret void
@@ -289,12 +289,12 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -2.0, ptr addrspace(1) %out
ret void
@@ -313,12 +313,12 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 4.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 4.0, ptr addrspace(1) %out
ret void
@@ -337,12 +337,12 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -4.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float -4.0, ptr addrspace(1) %out
ret void
@@ -361,12 +361,12 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_inv_2pi_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0.15915494
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0x3FC45F3060000000, ptr addrspace(1) %out
ret void
@@ -385,12 +385,12 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out)
;
; VI-LABEL: store_inline_imm_m_inv_2pi_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0xbe22f983
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 0xBFC45F3060000000, ptr addrspace(1) %out
ret void
@@ -409,12 +409,12 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) {
;
; VI-LABEL: store_literal_imm_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x45800000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
store float 4096.0, ptr addrspace(1) %out
ret void
@@ -434,13 +434,13 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_0.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0.0
store float %y, ptr addrspace(1) %out
@@ -461,13 +461,13 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 0.5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 0.5
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0.5
store float %y, ptr addrspace(1) %out
@@ -488,13 +488,13 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -0.5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -0.5
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -0.5
store float %y, ptr addrspace(1) %out
@@ -515,13 +515,13 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 1.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 1.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 1.0
store float %y, ptr addrspace(1) %out
@@ -542,13 +542,13 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_1.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -1.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -1.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -1.0
store float %y, ptr addrspace(1) %out
@@ -569,13 +569,13 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 2.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 2.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 2.0
store float %y, ptr addrspace(1) %out
@@ -596,13 +596,13 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_2.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -2.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -2.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -2.0
store float %y, ptr addrspace(1) %out
@@ -623,13 +623,13 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %
;
; VI-LABEL: add_inline_imm_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 4.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 4.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 4.0
store float %y, ptr addrspace(1) %out
@@ -650,13 +650,13 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo
;
; VI-LABEL: add_inline_imm_neg_4.0_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, -4.0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, -4.0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, -4.0
store float %y, ptr addrspace(1) %out
@@ -684,20 +684,20 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out,
;
; VI-LABEL: commute_add_inline_imm_0.5_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, 0.5, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 0.5
@@ -726,20 +726,20 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: commute_add_literal_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v0, 0x44800000, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load float, ptr addrspace(1) %in
%y = fadd float %x, 1024.0
@@ -761,13 +761,13 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x)
;
; VI-LABEL: add_inline_imm_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36a0000000000000
store float %y, ptr addrspace(1) %out
@@ -788,13 +788,13 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x)
;
; VI-LABEL: add_inline_imm_2_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 2
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36b0000000000000
store float %y, ptr addrspace(1) %out
@@ -815,13 +815,13 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 16
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 16
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36e0000000000000
store float %y, ptr addrspace(1) %out
@@ -843,14 +843,14 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float
;
; VI-LABEL: add_inline_imm_neg_1_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -1
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -1
@@ -874,14 +874,14 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float
;
; VI-LABEL: add_inline_imm_neg_2_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -2
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -2
@@ -905,14 +905,14 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa
;
; VI-LABEL: add_inline_imm_neg_16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_i32 s4, s4, -16
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_add_i32 s0, s2, -16
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%xbc = bitcast float %x to i32
%y = add i32 %xbc, -16
@@ -935,13 +935,13 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_63_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 63
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 63
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x36ff800000000000
store float %y, ptr addrspace(1) %out
@@ -962,13 +962,13 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x
;
; VI-LABEL: add_inline_imm_64_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_add_f32_e64 v0, s4, 64
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_add_f32_e64 v0, s2, 64
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd float %x, 0x3700000000000000
store float %y, ptr addrspace(1) %out
@@ -990,12 +990,12 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_0.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0.0
store double %y, ptr addrspace(1) %out
@@ -1017,12 +1017,12 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_0.5_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.5
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0.5
store double %y, ptr addrspace(1) %out
@@ -1044,12 +1044,12 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_0.5_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -0.5
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -0.5
store double %y, ptr addrspace(1) %out
@@ -1071,12 +1071,12 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_1.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 1.0
store double %y, ptr addrspace(1) %out
@@ -1098,12 +1098,12 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_1.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -1.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -1.0
store double %y, ptr addrspace(1) %out
@@ -1125,12 +1125,12 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_2.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 2.0
store double %y, ptr addrspace(1) %out
@@ -1152,12 +1152,12 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_2.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -2.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -2.0
store double %y, ptr addrspace(1) %out
@@ -1179,12 +1179,12 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: add_inline_imm_4.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 4.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 4.0
store double %y, ptr addrspace(1) %out
@@ -1206,12 +1206,12 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_neg_4.0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], -4.0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, -4.0
store double %y, ptr addrspace(1) %out
@@ -1235,12 +1235,12 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8
; VI-LABEL: add_inline_imm_inv_2pi_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 0.15915494309189532
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x3fc45f306dc9c882
store double %y, ptr addrspace(1) %out
@@ -1264,14 +1264,14 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d
; VI-LABEL: add_m_inv_2pi_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xbfc45f306dc9c882
store double %y, ptr addrspace(1) %out
@@ -1293,12 +1293,12 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32]
; VI-LABEL: add_inline_imm_1_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000001
store double %y, ptr addrspace(1) %out
@@ -1320,12 +1320,12 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32]
; VI-LABEL: add_inline_imm_2_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 2
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000002
store double %y, ptr addrspace(1) %out
@@ -1347,12 +1347,12 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_16_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 16
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000010
store double %y, ptr addrspace(1) %out
@@ -1373,13 +1373,13 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_1_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, -1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xffffffffffffffff
store double %y, ptr addrspace(1) %out
@@ -1400,13 +1400,13 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_2_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -2
; VI-NEXT: v_mov_b32_e32 v1, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffffe
store double %y, ptr addrspace(1) %out
@@ -1427,13 +1427,13 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x
;
; VI-LABEL: add_inline_imm_neg_16_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -16
; VI-NEXT: v_mov_b32_e32 v1, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0xfffffffffffffff0
store double %y, ptr addrspace(1) %out
@@ -1455,12 +1455,12 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_63_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 63
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x000000000000003F
store double %y, ptr addrspace(1) %out
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32
; VI-LABEL: add_inline_imm_64_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], s[2:3], 64
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%y = fadd double %x, 0x0000000000000040
store double %y, ptr addrspace(1) %out
@@ -1508,13 +1508,13 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0.0, ptr addrspace(1) %out
ret void
@@ -1534,13 +1534,13 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out)
;
; VI-LABEL: store_literal_imm_neg_0.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_bfrev_b32_e32 v1, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -0.0, ptr addrspace(1) %out
ret void
@@ -1560,13 +1560,13 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_0.5_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x3fe00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0.5, ptr addrspace(1) %out
ret void
@@ -1586,13 +1586,13 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_0.5_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbfe00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -0.5, ptr addrspace(1) %out
ret void
@@ -1612,13 +1612,13 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_1.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 1.0, ptr addrspace(1) %out
ret void
@@ -1638,13 +1638,13 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_1.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xbff00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -1.0, ptr addrspace(1) %out
ret void
@@ -1664,13 +1664,13 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_2.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 2.0, ptr addrspace(1) %out
ret void
@@ -1690,13 +1690,13 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_2.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, -2.0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -2.0, ptr addrspace(1) %out
ret void
@@ -1716,13 +1716,13 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_4.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40100000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 4.0, ptr addrspace(1) %out
ret void
@@ -1742,13 +1742,13 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inline_imm_m_4.0_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0xc0100000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double -4.0, ptr addrspace(1) %out
ret void
@@ -1768,13 +1768,13 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_inv_2pi_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0x3fc45f30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0x3fc45f306dc9c882, ptr addrspace(1) %out
ret void
@@ -1794,13 +1794,13 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out)
;
; VI-LABEL: store_inline_imm_m_inv_2pi_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x6dc9c882
; VI-NEXT: v_mov_b32_e32 v1, 0xbfc45f30
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 0xbfc45f306dc9c882, ptr addrspace(1) %out
ret void
@@ -1820,13 +1820,13 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) {
;
; VI-LABEL: store_literal_imm_f64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0x40b00000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
store double 4096.0, ptr addrspace(1) %out
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 3cabe41afb05a..44e8ae01fd692 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
; GCN-NEXT: liveins: $vgpr0, $sgpr2_sgpr3
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
- ; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
+ ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr2_sgpr3, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1
; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
; GCN-NEXT: renamable $sgpr4 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 18d5c057d156a..2ecc51dbcb81e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -4,28 +4,28 @@
define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) {
; GCN-LABEL: float4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_mov_b32_e32 v0, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -37,14 +37,14 @@ entry:
define amdgpu_kernel void @float4_inselt_undef(ptr addrspace(1) %out, i32 %sel) {
; GCN-LABEL: float4_inselt_undef:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1.0
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v2, v0
; GCN-NEXT: v_mov_b32_e32 v3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -56,24 +56,24 @@ entry:
define amdgpu_kernel void @int4_inselt(ptr addrspace(1) %out, <4 x i32> %vec, i32 %sel) {
; GCN-LABEL: int4_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
-; GCN-NEXT: s_cselect_b32 s3, s7, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
-; GCN-NEXT: s_cselect_b32 s6, s6, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
+; GCN-NEXT: s_cselect_b32 s0, s7, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
+; GCN-NEXT: s_cselect_b32 s1, s6, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: s_cselect_b32 s5, s5, 1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, s4, 1
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s4, s4, 1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -110,27 +110,27 @@ define amdgpu_kernel void @float8_inselt(ptr addrspace(1) %out, <8 x float> %vec
; GCN-LABEL: float8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x64
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s12, s[0:1], 0x64
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
; GCN-NEXT: v_mov_b32_e32 v6, s10
; GCN-NEXT: v_mov_b32_e32 v7, s11
-; GCN-NEXT: v_mov_b32_e32 v9, s3
+; GCN-NEXT: s_mov_b32 m0, s12
+; GCN-NEXT: v_mov_b32_e32 v9, s1
; GCN-NEXT: v_movreld_b32_e32 v0, 1.0
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -336,56 +336,56 @@ define amdgpu_kernel void @half8_inselt(ptr addrspace(1) %out, <8 x half> %vec,
; GCN-LABEL: half8_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0x3c00
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s3, s7, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 7
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_lshr_b32 s0, s7, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 7
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 6
+; GCN-NEXT: s_cmp_lg_u32 s8, 6
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 16
+; GCN-NEXT: s_lshr_b32 s0, s6, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 5
+; GCN-NEXT: s_cmp_lg_u32 s8, 5
; GCN-NEXT: v_or_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 4
+; GCN-NEXT: s_cmp_lg_u32 s8, 4
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 16
+; GCN-NEXT: s_lshr_b32 s0, s5, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_or_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 16
+; GCN-NEXT: s_lshr_b32 s0, s4, 16
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
; GCN-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -468,98 +468,98 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
; GCN-LABEL: byte16_inselt:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshr_b32 s3, s7, 24
-; GCN-NEXT: s_cmp_lg_u32 s2, 15
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: s_lshr_b32 s0, s7, 24
+; GCN-NEXT: s_cmp_lg_u32 s8, 15
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s7, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 14
+; GCN-NEXT: s_lshr_b32 s0, s7, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 14
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s7, 8
+; GCN-NEXT: s_lshr_b32 s0, s7, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 13
+; GCN-NEXT: s_cmp_lg_u32 s8, 13
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 12
+; GCN-NEXT: s_cmp_lg_u32 s8, 12
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s7
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT: s_lshr_b32 s3, s6, 24
+; GCN-NEXT: s_lshr_b32 s0, s6, 24
; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 11
+; GCN-NEXT: s_cmp_lg_u32 s8, 11
; GCN-NEXT: v_or_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 10
+; GCN-NEXT: s_lshr_b32 s0, s6, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 10
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s6, 8
+; GCN-NEXT: s_lshr_b32 s0, s6, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 9
+; GCN-NEXT: s_cmp_lg_u32 s8, 9
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 8
+; GCN-NEXT: s_cmp_lg_u32 s8, 8
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v2, 1, v2, vcc
-; GCN-NEXT: s_lshr_b32 s3, s5, 24
+; GCN-NEXT: s_lshr_b32 s0, s5, 24
; GCN-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 7
+; GCN-NEXT: s_cmp_lg_u32 s8, 7
; GCN-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 6
+; GCN-NEXT: s_lshr_b32 s0, s5, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 6
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s5, 8
+; GCN-NEXT: s_lshr_b32 s0, s5, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 5
+; GCN-NEXT: s_cmp_lg_u32 s8, 5
; GCN-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 4
+; GCN-NEXT: s_cmp_lg_u32 s8, 4
; GCN-NEXT: v_cndmask_b32_e32 v1, 1, v1, vcc
; GCN-NEXT: v_mov_b32_e32 v4, s5
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT: s_lshr_b32 s3, s4, 24
+; GCN-NEXT: s_lshr_b32 s0, s4, 24
; GCN-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: s_cmp_lg_u32 s2, 3
+; GCN-NEXT: s_cmp_lg_u32 s8, 3
; GCN-NEXT: v_or_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 16
-; GCN-NEXT: s_cmp_lg_u32 s2, 2
+; GCN-NEXT: s_lshr_b32 s0, s4, 16
+; GCN-NEXT: s_cmp_lg_u32 s8, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, 1, v0, vcc
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_lshr_b32 s3, s4, 8
+; GCN-NEXT: s_lshr_b32 s0, s4, 8
; GCN-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
-; GCN-NEXT: s_cmp_lg_u32 s2, 1
+; GCN-NEXT: s_cmp_lg_u32 s8, 1
; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v4, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s8, 0
; GCN-NEXT: v_cndmask_b32_e32 v4, 1, v4, vcc
; GCN-NEXT: v_mov_b32_e32 v5, s4
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
@@ -567,8 +567,8 @@ define amdgpu_kernel void @byte16_inselt(ptr addrspace(1) %out, <16 x i8> %vec,
; GCN-NEXT: v_cndmask_b32_e32 v5, 1, v5, vcc
; GCN-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GCN-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -580,22 +580,22 @@ entry:
define amdgpu_kernel void @double2_inselt(ptr addrspace(1) %out, <2 x double> %vec, i32 %sel) {
; GCN-LABEL: double2_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0x44
+; GCN-NEXT: s_load_dword s8, s[0:1], 0x44
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s2, 1
-; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s7
-; GCN-NEXT: s_cselect_b32 s6, 0, s6
-; GCN-NEXT: s_cmp_eq_u32 s2, 0
-; GCN-NEXT: s_cselect_b32 s2, 0x3ff00000, s5
+; GCN-NEXT: s_cmp_eq_u32 s8, 1
+; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
+; GCN-NEXT: s_cselect_b32 s1, 0, s6
+; GCN-NEXT: s_cmp_eq_u32 s8, 0
+; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5
; GCN-NEXT: s_cselect_b32 s4, 0, s4
-; GCN-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NEXT: v_mov_b32_e32 v5, s3
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s2
-; GCN-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -607,48 +607,48 @@ entry:
define amdgpu_kernel void @double5_inselt(ptr addrspace(1) %out, <5 x double> %vec, i32 %sel) {
; GCN-LABEL: double5_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s12, s[0:1], 0xa4
-; GCN-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x84
-; GCN-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
-; GCN-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; GCN-NEXT: s_load_dword s14, s[0:1], 0xa4
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x84
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_cmp_eq_u32 s12, 4
-; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
-; GCN-NEXT: s_cselect_b32 s8, 0, s8
-; GCN-NEXT: s_cmp_eq_u32 s12, 1
+; GCN-NEXT: s_cmp_eq_u32 s14, 4
; GCN-NEXT: s_cselect_b32 s3, 0x3ff00000, s3
; GCN-NEXT: s_cselect_b32 s2, 0, s2
-; GCN-NEXT: s_cmp_eq_u32 s12, 0
-; GCN-NEXT: s_cselect_b32 s13, 0x3ff00000, s1
-; GCN-NEXT: s_cselect_b32 s14, 0, s0
-; GCN-NEXT: s_cmp_eq_u32 s12, 3
-; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s7
-; GCN-NEXT: s_cselect_b32 s1, 0, s6
-; GCN-NEXT: s_cmp_eq_u32 s12, 2
+; GCN-NEXT: s_cmp_eq_u32 s14, 1
+; GCN-NEXT: s_cselect_b32 s7, 0x3ff00000, s7
+; GCN-NEXT: s_cselect_b32 s6, 0, s6
+; GCN-NEXT: s_cmp_eq_u32 s14, 0
; GCN-NEXT: s_cselect_b32 s5, 0x3ff00000, s5
; GCN-NEXT: s_cselect_b32 s4, 0, s4
+; GCN-NEXT: s_cmp_eq_u32 s14, 3
+; GCN-NEXT: s_cselect_b32 s0, 0x3ff00000, s11
+; GCN-NEXT: s_cselect_b32 s1, 0, s10
+; GCN-NEXT: s_cmp_eq_u32 s14, 2
+; GCN-NEXT: s_cselect_b32 s9, 0x3ff00000, s9
+; GCN-NEXT: s_cselect_b32 s8, 0, s8
; GCN-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NEXT: s_add_u32 s0, s10, 16
+; GCN-NEXT: s_add_u32 s0, s12, 16
; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: s_addc_u32 s1, s11, 0
+; GCN-NEXT: s_addc_u32 s1, s13, 0
; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mov_b32_e32 v1, s9
; GCN-NEXT: v_mov_b32_e32 v4, s0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: v_mov_b32_e32 v4, s10
-; GCN-NEXT: s_add_u32 s0, s10, 32
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NEXT: v_mov_b32_e32 v5, s11
-; GCN-NEXT: s_addc_u32 s1, s11, 0
+; GCN-NEXT: v_mov_b32_e32 v4, s12
+; GCN-NEXT: s_add_u32 s0, s12, 32
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NEXT: v_mov_b32_e32 v5, s13
+; GCN-NEXT: s_addc_u32 s1, s13, 0
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-NEXT: s_endpgm
@@ -661,12 +661,12 @@ entry:
define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %vec, i32 %sel) {
; GCN-LABEL: double8_inselt:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_dword s2, s[0:1], 0xa4
+; GCN-NEXT: s_load_dword s20, s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v16, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_lshl_b32 s2, s2, 1
+; GCN-NEXT: s_lshl_b32 s0, s20, 1
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
@@ -683,29 +683,29 @@ define amdgpu_kernel void @double8_inselt(ptr addrspace(1) %out, <8 x double> %v
; GCN-NEXT: v_mov_b32_e32 v13, s17
; GCN-NEXT: v_mov_b32_e32 v14, s18
; GCN-NEXT: v_mov_b32_e32 v15, s19
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
+; GCN-NEXT: s_mov_b32 m0, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v16
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -773,11 +773,12 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double>
; GCN-NEXT: s_load_dword s2, s[0:1], 0x124
; GCN-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0xa4
; GCN-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0xe4
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v32, 0x3ff00000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: s_lshl_b32 s2, s2, 1
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
@@ -809,53 +810,53 @@ define amdgpu_kernel void @double16_inselt(ptr addrspace(1) %out, <16 x double>
; GCN-NEXT: v_mov_b32_e32 v29, s17
; GCN-NEXT: v_mov_b32_e32 v30, s18
; GCN-NEXT: v_mov_b32_e32 v31, s19
-; GCN-NEXT: s_mov_b32 m0, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x70
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s2, 0x70
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_movreld_b32_e32 v1, v32
-; GCN-NEXT: v_mov_b32_e32 v33, s3
-; GCN-NEXT: v_mov_b32_e32 v32, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v33, s1
+; GCN-NEXT: v_mov_b32_e32 v32, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x60
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[32:33], v[28:31]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v29, s3
-; GCN-NEXT: v_mov_b32_e32 v28, s2
-; GCN-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v29, s1
+; GCN-NEXT: v_mov_b32_e32 v28, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x50
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v25, s3
-; GCN-NEXT: v_mov_b32_e32 v24, s2
-; GCN-NEXT: s_add_u32 s2, s0, 64
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v25, s1
+; GCN-NEXT: v_mov_b32_e32 v24, s0
+; GCN-NEXT: s_add_u32 s0, s2, 64
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v21, s3
-; GCN-NEXT: v_mov_b32_e32 v20, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v21, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
entry:
@@ -875,12 +876,14 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: s_load_dword s4, s[0:1], 0x124
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v28, s2
+; GCN-NEXT: v_mov_b32_e32 v29, s3
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s6
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshl_b32 s2, s4, 1
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v3, s7
; GCN-NEXT: v_mov_b32_e32 v4, s8
; GCN-NEXT: v_mov_b32_e32 v5, s9
@@ -906,49 +909,48 @@ define amdgpu_kernel void @double15_inselt(ptr addrspace(1) %out, <15 x double>
; GCN-NEXT: v_mov_b32_e32 v25, s21
; GCN-NEXT: v_mov_b32_e32 v26, s22
; GCN-NEXT: v_mov_b32_e32 v27, s23
-; GCN-NEXT: v_mov_b32_e32 v29, s3
-; GCN-NEXT: s_mov_b32 m0, s2
; GCN-NEXT: v_movreld_b32_e32 v0, 0
-; GCN-NEXT: s_add_u32 s2, s0, 0x50
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_add_u32 s0, s2, 0x50
; GCN-NEXT: v_movreld_b32_e32 v1, v32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v31, s3
-; GCN-NEXT: v_mov_b32_e32 v30, s2
-; GCN-NEXT: s_add_u32 s2, s0, 64
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NEXT: v_mov_b32_e32 v31, s1
+; GCN-NEXT: v_mov_b32_e32 v30, s0
+; GCN-NEXT: s_add_u32 s0, s2, 64
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[30:31], v[20:23]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v21, s3
-; GCN-NEXT: v_mov_b32_e32 v20, s2
-; GCN-NEXT: s_add_u32 s2, s0, 48
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v21, s1
+; GCN-NEXT: v_mov_b32_e32 v20, s0
+; GCN-NEXT: s_add_u32 s0, s2, 48
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v17, s3
-; GCN-NEXT: v_mov_b32_e32 v16, s2
-; GCN-NEXT: s_add_u32 s2, s0, 32
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v17, s1
+; GCN-NEXT: v_mov_b32_e32 v16, s0
+; GCN-NEXT: s_add_u32 s0, s2, 32
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NEXT: s_add_u32 s2, s0, 16
-; GCN-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NEXT: s_add_u32 s0, s2, 16
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-NEXT: s_nop 0
-; GCN-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NEXT: v_mov_b32_e32 v8, s2
+; GCN-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NEXT: v_mov_b32_e32 v8, s0
; GCN-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GCN-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: s_add_u32 s0, s2, 0x70
+; GCN-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NEXT: v_mov_b32_e32 v4, s2
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: s_add_u32 s0, s0, 0x60
+; GCN-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: s_add_u32 s0, s2, 0x60
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[28:29]
-; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 68427e8937bb9..eb7c587d46d7e 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1741,20 +1741,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
; VI-LABEL: s_dynamic_insertelement_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; VI-NEXT: s_load_dword s8, s[4:5], 0x10
+; VI-NEXT: s_load_dword s10, s[4:5], 0x10
; VI-NEXT: s_mov_b32 s7, 0x1100f000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; VI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_lshl_b32 s0, s8, 3
+; VI-NEXT: s_lshl_b32 s0, s10, 3
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0
-; VI-NEXT: s_and_b32 s9, s1, 0x5050505
+; VI-NEXT: s_and_b32 s3, s1, 0x5050505
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; VI-NEXT: s_and_b32 s8, s0, 0x5050505
-; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3]
+; VI-NEXT: s_andn2_b64 s[8:9], s[8:9], s[0:1]
+; VI-NEXT: s_and_b32 s2, s0, 0x5050505
+; VI-NEXT: s_or_b64 s[0:1], s[2:3], s[8:9]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
index 13134601cc33d..e351b6d2e3c8d 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll
@@ -907,12 +907,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: .LBB7_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -936,12 +936,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB7_2:
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX90A-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_endpgm
;
@@ -966,12 +966,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX10-NEXT: .LBB7_2:
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
@@ -995,12 +995,12 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-FLATSCR-NEXT: .LBB7_2:
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_endpgm
;
@@ -1025,13 +1025,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: .LBB7_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
@@ -1056,13 +1056,13 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_inv scope:SCOPE_SE
; GFX12-NEXT: .LBB7_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
%val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
@@ -1095,12 +1095,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: .LBB8_2:
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
@@ -1123,12 +1123,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: .LBB8_2:
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX90A-NEXT: global_store_dword v2, v0, s[2:3]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_endpgm
;
@@ -1151,12 +1151,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10-NEXT: .LBB8_2:
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_readfirstlane_b32 s2, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v1
; GFX10-NEXT: v_mov_b32_e32 v1, 0
-; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX10-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
@@ -1179,12 +1179,12 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: .LBB8_2:
; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[2:3]
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX9-FLATSCR-NEXT: s_endpgm
;
@@ -1208,13 +1208,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: .LBB8_2:
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_readfirstlane_b32 s2, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_endpgm
;
@@ -1238,13 +1238,13 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: .LBB8_2:
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_readfirstlane_b32 s2, v1
+; GFX12-NEXT: v_readfirstlane_b32 s0, v1
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s0
+; GFX12-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index a344128d94fcf..aab7b57d66723 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -20,13 +20,13 @@ define amdgpu_kernel void @i8_arg(ptr addrspace(1) nocapture %out, i8 %in) nounw
;
; VI-LABEL: i8_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -92,13 +92,13 @@ define amdgpu_kernel void @i8_zext_arg(ptr addrspace(1) nocapture %out, i8 zeroe
;
; VI-LABEL: i8_zext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -167,13 +167,13 @@ define amdgpu_kernel void @i8_sext_arg(ptr addrspace(1) nocapture %out, i8 signe
;
; VI-LABEL: i8_sext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i8 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sext_i32_i8 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -242,13 +242,13 @@ define amdgpu_kernel void @i16_arg(ptr addrspace(1) nocapture %out, i16 %in) nou
;
; VI-LABEL: i16_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -314,13 +314,13 @@ define amdgpu_kernel void @i16_zext_arg(ptr addrspace(1) nocapture %out, i16 zer
;
; VI-LABEL: i16_zext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -389,13 +389,13 @@ define amdgpu_kernel void @i16_sext_arg(ptr addrspace(1) nocapture %out, i16 sig
;
; VI-LABEL: i16_sext_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_sext_i32_i16 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -858,18 +858,18 @@ define amdgpu_kernel void @v3i8_arg(ptr addrspace(1) nocapture %out, <3 x i8> %i
;
; VI-LABEL: v3i8_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s3, s2, 16
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_add_u32 s0, s0, 2
-; VI-NEXT: s_addc_u32 s1, s1, 0
+; VI-NEXT: s_lshr_b32 s1, s4, 16
+; VI-NEXT: s_add_u32 s0, s2, 2
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: flat_store_short v[0:1], v4
; VI-NEXT: s_endpgm
@@ -1118,13 +1118,13 @@ define amdgpu_kernel void @v3i32_arg(ptr addrspace(1) nocapture %out, <3 x i32>
; VI-LABEL: v3i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -1197,13 +1197,13 @@ define amdgpu_kernel void @v3f32_arg(ptr addrspace(1) nocapture %out, <3 x float
; VI-LABEL: v3f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-NEXT: s_endpgm
;
@@ -1396,15 +1396,15 @@ define amdgpu_kernel void @v4i32_arg(ptr addrspace(1) nocapture %out, <4 x i32>
;
; VI-LABEL: v4i32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -1470,15 +1470,15 @@ define amdgpu_kernel void @v4f32_arg(ptr addrspace(1) nocapture %out, <4 x float
;
; VI-LABEL: v4f32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -1688,19 +1688,19 @@ define amdgpu_kernel void @v5i16_arg(ptr addrspace(1) nocapture %out, <5 x i16>
; VI-LABEL: v5i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dword s6, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s4, s2, 8
-; VI-NEXT: v_mov_b32_e32 v4, s5
-; VI-NEXT: s_addc_u32 s5, s3, 0
-; VI-NEXT: v_mov_b32_e32 v2, s4
-; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: s_add_u32 s0, s2, 8
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v4, s6
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: flat_store_short v[2:3], v4
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -1920,22 +1920,22 @@ define amdgpu_kernel void @v5i32_arg(ptr addrspace(1) nocapture %out, <5 x i32>
;
; VI-LABEL: v5i32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dword s7, s[0:1], 0x54
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s8, s[0:1], 0x54
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: flat_store_dword v[0:1], v2
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2018,22 +2018,22 @@ define amdgpu_kernel void @v5f32_arg(ptr addrspace(1) nocapture %out, <5 x float
;
; VI-LABEL: v5f32_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dword s7, s[0:1], 0x54
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dword s8, s[0:1], 0x54
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x44
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s6, s4, 16
-; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s7, s5, 0
-; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v2, s1
+; VI-NEXT: v_mov_b32_e32 v3, s8
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: flat_store_dword v[1:2], v3
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2124,32 +2124,32 @@ define amdgpu_kernel void @v5i64_arg(ptr addrspace(1) nocapture %out, <5 x i64>
;
; VI-LABEL: v5i64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
-; VI-NEXT: v_mov_b32_e32 v2, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v2, s13
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2266,32 +2266,32 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
;
; VI-LABEL: v5f64_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x84
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x64
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x84
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x64
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s12, s8, 32
-; VI-NEXT: v_mov_b32_e32 v1, s10
-; VI-NEXT: s_addc_u32 s13, s9, 0
-; VI-NEXT: v_mov_b32_e32 v3, s12
-; VI-NEXT: v_mov_b32_e32 v2, s11
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s13
-; VI-NEXT: s_add_u32 s4, s8, 16
+; VI-NEXT: s_add_u32 s0, s2, 32
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v1, s12
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: v_mov_b32_e32 v2, s13
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: s_addc_u32 s5, s9, 0
-; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v4, s8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_mov_b32_e32 v5, s9
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2649,15 +2649,15 @@ define amdgpu_kernel void @v8i16_arg(ptr addrspace(1) %out, <8 x i16> %in) {
;
; VI-LABEL: v8i16_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -2904,23 +2904,23 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; VI-LABEL: v8i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3015,23 +3015,23 @@ define amdgpu_kernel void @v8f32_arg(ptr addrspace(1) nocapture %out, <8 x float
; VI-LABEL: v8f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3120,15 +3120,15 @@ define amdgpu_kernel void @v16i8_arg(ptr addrspace(1) %out, <16 x i8> %in) {
;
; VI-LABEL: v16i8_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -3577,23 +3577,23 @@ define amdgpu_kernel void @v16i16_arg(ptr addrspace(1) %out, <16 x i16> %in) {
; VI-LABEL: v16i16_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
-; VI-NEXT: s_add_u32 s2, s0, 16
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: s_add_u32 s0, s2, 16
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4045,41 +4045,41 @@ define amdgpu_kernel void @v16i32_arg(ptr addrspace(1) nocapture %out, <16 x i32
; VI-LABEL: v16i32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: s_add_u32 s0, s2, 48
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 32
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4233,41 +4233,41 @@ define amdgpu_kernel void @v16f32_arg(ptr addrspace(1) nocapture %out, <16 x flo
; VI-LABEL: v16f32_arg:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx16 s[4:19], s[0:1], 0x64
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s16
-; VI-NEXT: s_add_u32 s2, s0, 48
-; VI-NEXT: s_addc_u32 s3, s1, 0
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 32
+; VI-NEXT: s_add_u32 s0, s2, 48
+; VI-NEXT: s_addc_u32 s1, s3, 0
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 32
; VI-NEXT: v_mov_b32_e32 v1, s17
; VI-NEXT: v_mov_b32_e32 v2, s18
; VI-NEXT: v_mov_b32_e32 v3, s19
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
-; VI-NEXT: v_mov_b32_e32 v4, s2
-; VI-NEXT: s_add_u32 s2, s0, 16
+; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: s_add_u32 s0, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s12
; VI-NEXT: v_mov_b32_e32 v1, s13
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_mov_b32_e32 v3, s15
-; VI-NEXT: s_addc_u32 s3, s1, 0
+; VI-NEXT: s_addc_u32 s1, s3, 0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
-; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm
;
@@ -4401,12 +4401,12 @@ define amdgpu_kernel void @kernel_arg_i64(ptr addrspace(1) %out, i64 %a) nounwin
;
; VI-LABEL: kernel_arg_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -4463,12 +4463,12 @@ define amdgpu_kernel void @f64_kernel_arg(ptr addrspace(1) %out, double %in) {
;
; VI-LABEL: f64_kernel_arg:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -4652,13 +4652,13 @@ define amdgpu_kernel void @i1_arg(ptr addrspace(1) %out, i1 %x) nounwind {
;
; VI-LABEL: i1_arg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4743,13 +4743,13 @@ define amdgpu_kernel void @i1_arg_zext_i32(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_zext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4816,14 +4816,14 @@ define amdgpu_kernel void @i1_arg_zext_i64(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_zext_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 1
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_and_b32 s0, s4, 1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -4891,13 +4891,13 @@ define amdgpu_kernel void @i1_arg_sext_i32(ptr addrspace(1) %out, i1 %x) nounwin
;
; VI-LABEL: i1_arg_sext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s2, s2, 0x10000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_bfe_i32 s0, s4, 0x10000
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -4967,13 +4967,13 @@ define amdgpu_kernel void @i1_arg_sext_i64(ptr addrspace(1) %out, i1 %x) nounwin
; VI-LABEL: i1_arg_sext_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x10000
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
@@ -5089,25 +5089,25 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
;
; VI-LABEL: struct_argument_alignment:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
+; VI-NEXT: s_load_dword s7, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x44
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -5254,14 +5254,14 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x28
+; VI-NEXT: s_load_dword s4, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v7, s2
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v4, s0
+; VI-NEXT: v_mov_b32_e32 v7, s4
+; VI-NEXT: v_mov_b32_e32 v5, s3
+; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: flat_store_dword v[2:3], v7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
@@ -5413,32 +5413,32 @@ define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8,
;
; VI-LABEL: struct_argument_alignment_after:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s8, s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
-; VI-NEXT: s_load_dword s9, s[0:1], 0x3c
-; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x44
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54
+; VI-NEXT: s_load_dword s10, s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; VI-NEXT: s_load_dword s11, s[0:1], 0x3c
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x54
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v0, s10
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s9
+; VI-NEXT: v_mov_b32_e32 v0, s11
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
@@ -5902,12 +5902,12 @@ define amdgpu_kernel void @byref_align_constant_i32_arg(ptr addrspace(1) nocaptu
; VI-LABEL: byref_align_constant_i32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x124
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x124
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
index 142a6ed19daf8..1f14da1641d75 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll
@@ -80,25 +80,25 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: ds_min_f32 v2, v0 offset:64
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_min_rtn_f32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; VI-NEXT: s_endpgm
@@ -159,20 +159,20 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; GFX11-LABEL: lds_ds_fmin:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32
; GFX11-NEXT: ds_min_f32 v2, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_min_rtn_f32 v0, v3, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmin:
@@ -235,26 +235,26 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_waitcnt lgkmcnt(0)
-; G_VI-NEXT: s_add_i32 s2, s2, 4
-; G_VI-NEXT: s_lshl_b32 s3, s2, 3
; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
; G_VI-NEXT: s_mov_b32 m0, -1
+; G_VI-NEXT: s_waitcnt lgkmcnt(0)
+; G_VI-NEXT: s_add_i32 s4, s4, 4
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s0
; G_VI-NEXT: ds_min_rtn_f32 v1, v1, v0
-; G_VI-NEXT: s_lshl_b32 s2, s2, 4
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: ds_min_f32 v2, v0
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_min_rtn_f32 v0, v0, v1
-; G_VI-NEXT: v_mov_b32_e32 v1, s0
+; G_VI-NEXT: v_mov_b32_e32 v1, s2
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; G_VI-NEXT: s_endpgm
@@ -289,49 +289,48 @@ define amdgpu_kernel void @lds_ds_fmin(ptr addrspace(5) %out, ptr addrspace(3) %
; G_GFX10-LABEL: lds_ds_fmin:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT: s_mov_b32 s6, -1
-; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT: s_add_u32 s4, s4, s3
-; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX10-NEXT: s_mov_b32 s10, -1
+; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
+; G_GFX10-NEXT: s_add_u32 s8, s8, s3
+; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_add_i32 s2, s2, 4
-; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
+; G_GFX10-NEXT: s_add_i32 s4, s2, 4
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s0
; G_GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: ds_min_f32 v2, v1
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s2
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; G_GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX11-LABEL: lds_ds_fmin:
; G_GFX11: ; %bb.0:
-; G_GFX11-NEXT: s_clause 0x1
; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT: v_mov_b32_e32 v2, s2
+; G_GFX11-NEXT: s_add_i32 s4, s2, 4
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX11-NEXT: v_mov_b32_e32 v2, s0
; G_GFX11-NEXT: ds_min_rtn_f32 v0, v0, v1
+; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX11-NEXT: v_mov_b32_e32 v3, s3
; G_GFX11-NEXT: ds_min_f32 v2, v1
-; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_min_rtn_f32 v0, v3, v0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b32 off, v0, s0
+; G_GFX11-NEXT: scratch_store_b32 off, v0, s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -406,25 +405,25 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: ds_max_f32 v2, v0 offset:64
-; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_max_rtn_f32 v0, v0, v1
-; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; VI-NEXT: s_endpgm
@@ -485,20 +484,20 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; GFX11-LABEL: lds_ds_fmax:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s2
+; GFX11-NEXT: v_dual_mov_b32 v0, 0x42280000 :: v_dual_mov_b32 v1, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; GFX11-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32
; GFX11-NEXT: ds_max_f32 v2, v0 offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_max_rtn_f32 v0, v3, v1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-NEXT: scratch_store_b32 off, v0, s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmax:
@@ -561,26 +560,26 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_waitcnt lgkmcnt(0)
-; G_VI-NEXT: s_add_i32 s2, s2, 4
-; G_VI-NEXT: s_lshl_b32 s3, s2, 3
; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
; G_VI-NEXT: s_mov_b32 m0, -1
+; G_VI-NEXT: s_waitcnt lgkmcnt(0)
+; G_VI-NEXT: s_add_i32 s4, s4, 4
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s0
; G_VI-NEXT: ds_max_rtn_f32 v1, v1, v0
-; G_VI-NEXT: s_lshl_b32 s2, s2, 4
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: ds_max_f32 v2, v0
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_max_rtn_f32 v0, v0, v1
-; G_VI-NEXT: v_mov_b32_e32 v1, s0
+; G_VI-NEXT: v_mov_b32_e32 v1, s2
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen
; G_VI-NEXT: s_endpgm
@@ -615,49 +614,48 @@ define amdgpu_kernel void @lds_ds_fmax(ptr addrspace(5) %out, ptr addrspace(3) %
; G_GFX10-LABEL: lds_ds_fmax:
; G_GFX10: ; %bb.0:
; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c
-; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
-; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
-; G_GFX10-NEXT: s_mov_b32 s6, -1
-; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000
-; G_GFX10-NEXT: s_add_u32 s4, s4, s3
-; G_GFX10-NEXT: s_addc_u32 s5, s5, 0
-; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; G_GFX10-NEXT: s_mov_b32 s10, -1
+; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000
+; G_GFX10-NEXT: s_add_u32 s8, s8, s3
+; G_GFX10-NEXT: s_addc_u32 s9, s9, 0
; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: s_add_i32 s2, s2, 4
-; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX10-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX10-NEXT: v_mov_b32_e32 v2, s2
-; G_GFX10-NEXT: v_mov_b32_e32 v3, s1
+; G_GFX10-NEXT: s_add_i32 s4, s2, 4
+; G_GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_GFX10-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX10-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX10-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX10-NEXT: v_mov_b32_e32 v2, s0
; G_GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1
+; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX10-NEXT: v_mov_b32_e32 v3, s3
; G_GFX10-NEXT: ds_max_f32 v2, v1
-; G_GFX10-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v0
-; G_GFX10-NEXT: v_mov_b32_e32 v1, s0
+; G_GFX10-NEXT: v_mov_b32_e32 v1, s2
; G_GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
+; G_GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; G_GFX10-NEXT: s_endpgm
;
; G_GFX11-LABEL: lds_ds_fmax:
; G_GFX11: ; %bb.0:
-; G_GFX11-NEXT: s_clause 0x1
; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; G_GFX11-NEXT: v_mov_b32_e32 v1, 0x42280000
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v3, s1
-; G_GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; G_GFX11-NEXT: s_lshl_b32 s2, s2, 4
-; G_GFX11-NEXT: v_mov_b32_e32 v0, s3
-; G_GFX11-NEXT: v_mov_b32_e32 v2, s2
+; G_GFX11-NEXT: s_add_i32 s4, s2, 4
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s5
+; G_GFX11-NEXT: v_mov_b32_e32 v2, s0
; G_GFX11-NEXT: ds_max_rtn_f32 v0, v0, v1
+; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; G_GFX11-NEXT: v_mov_b32_e32 v3, s3
; G_GFX11-NEXT: ds_max_f32 v2, v1
-; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_max_rtn_f32 v0, v3, v0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b32 off, v0, s0
+; G_GFX11-NEXT: scratch_store_b32 off, v0, s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -740,28 +738,28 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v5, s2
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: ds_min_f64 v5, v[0:1] offset:64
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT: s_add_i32 s1, s0, 4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_i32 s0, s2, 4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -827,22 +825,22 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; GFX11-LABEL: lds_ds_fmin_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; GFX11-NEXT: v_mov_b32_e32 v5, s1
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32
; GFX11-NEXT: ds_min_f64 v4, v[0:1] offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmin_f64:
@@ -917,30 +915,30 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
-; G_VI-NEXT: s_mov_b32 s2, 0
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_VI-NEXT: s_mov_b32 s0, 0
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_mov_b32 s3, 0x40450000
+; G_VI-NEXT: s_mov_b32 s1, 0x40450000
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: s_add_i32 s4, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v0, s2
-; G_VI-NEXT: s_lshl_b32 s2, s4, 3
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: v_mov_b32_e32 v0, s0
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s1
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: s_mov_b32 m0, -1
; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT: s_lshl_b32 s2, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v4, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v4, s0
; G_VI-NEXT: ds_min_f64 v4, v[0:1]
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT: v_mov_b32_e32 v2, s0
-; G_VI-NEXT: s_add_u32 s0, s0, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_add_u32 s0, s2, 4
; G_VI-NEXT: v_mov_b32_e32 v3, s0
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1013,24 +1011,25 @@ define amdgpu_kernel void @lds_ds_fmin_f64(ptr addrspace(5) %out, ptr addrspace(
; G_GFX11-LABEL: lds_ds_fmin_f64:
; G_GFX11: ; %bb.0:
; G_GFX11-NEXT: s_clause 0x1
-; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; G_GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_mov_b32 s0, 0
+; G_GFX11-NEXT: s_mov_b32 s1, 0x40450000
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s4, s2, 4
-; G_GFX11-NEXT: s_mov_b32 s2, 0
-; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT: s_add_i32 s4, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v5, s3
; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT: v_mov_b32_e32 v4, s2
+; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; G_GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s5
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v4, s0
; G_GFX11-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1]
; G_GFX11-NEXT: ds_min_f64 v4, v[0:1]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
@@ -1113,28 +1112,28 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s90, -1
; VI-NEXT: s_mov_b32 s91, 0xe80000
; VI-NEXT: s_add_u32 s88, s88, s3
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_addc_u32 s89, s89, 0
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 3
; VI-NEXT: v_mov_b32_e32 v1, 0x40450000
-; VI-NEXT: v_mov_b32_e32 v2, s3
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_lshl_b32 s0, s4, 3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
-; VI-NEXT: s_lshl_b32 s2, s2, 4
-; VI-NEXT: v_mov_b32_e32 v5, s2
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: s_lshl_b32 s0, s4, 4
+; VI-NEXT: v_mov_b32_e32 v5, s0
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: ds_max_f64 v5, v[0:1] offset:64
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3]
-; VI-NEXT: s_add_i32 s1, s0, 4
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_add_i32 s0, s2, 4
+; VI-NEXT: v_mov_b32_e32 v3, s0
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen
; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1200,22 +1199,22 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; GFX11-LABEL: lds_ds_fmax_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshl_b32 s3, s2, 3
-; GFX11-NEXT: v_mov_b32_e32 v5, s1
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s3
-; GFX11-NEXT: s_lshl_b32 s2, s2, 4
+; GFX11-NEXT: s_lshl_b32 s0, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s3
+; GFX11-NEXT: v_dual_mov_b32 v1, 0x40450000 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: s_lshl_b32 s0, s4, 4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, s2
+; GFX11-NEXT: v_mov_b32_e32 v4, s0
; GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32
; GFX11-NEXT: ds_max_f64 v4, v[0:1] offset:64
; GFX11-NEXT: s_waitcnt lgkmcnt(1)
; GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; GFX11-NEXT: s_endpgm
;
; G_SI-LABEL: lds_ds_fmax_f64:
@@ -1290,30 +1289,30 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; G_VI: ; %bb.0:
; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
-; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; G_VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; G_VI-NEXT: s_mov_b32 s90, -1
; G_VI-NEXT: s_mov_b32 s91, 0xe80000
; G_VI-NEXT: s_add_u32 s88, s88, s3
-; G_VI-NEXT: s_mov_b32 s2, 0
+; G_VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; G_VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; G_VI-NEXT: s_mov_b32 s0, 0
; G_VI-NEXT: s_addc_u32 s89, s89, 0
-; G_VI-NEXT: s_mov_b32 s3, 0x40450000
+; G_VI-NEXT: s_mov_b32 s1, 0x40450000
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: s_add_i32 s4, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v0, s2
-; G_VI-NEXT: s_lshl_b32 s2, s4, 3
-; G_VI-NEXT: v_mov_b32_e32 v1, s3
-; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: v_mov_b32_e32 v0, s0
+; G_VI-NEXT: s_lshl_b32 s0, s4, 3
+; G_VI-NEXT: v_mov_b32_e32 v1, s1
+; G_VI-NEXT: v_mov_b32_e32 v2, s0
; G_VI-NEXT: s_mov_b32 m0, -1
; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
-; G_VI-NEXT: s_lshl_b32 s2, s4, 4
-; G_VI-NEXT: v_mov_b32_e32 v4, s2
+; G_VI-NEXT: s_lshl_b32 s0, s4, 4
+; G_VI-NEXT: v_mov_b32_e32 v4, s0
; G_VI-NEXT: ds_max_f64 v4, v[0:1]
-; G_VI-NEXT: v_mov_b32_e32 v0, s1
+; G_VI-NEXT: v_mov_b32_e32 v0, s3
; G_VI-NEXT: s_waitcnt lgkmcnt(1)
; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3]
-; G_VI-NEXT: v_mov_b32_e32 v2, s0
-; G_VI-NEXT: s_add_u32 s0, s0, 4
+; G_VI-NEXT: v_mov_b32_e32 v2, s2
+; G_VI-NEXT: s_add_u32 s0, s2, 4
; G_VI-NEXT: v_mov_b32_e32 v3, s0
; G_VI-NEXT: s_waitcnt lgkmcnt(0)
; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen
@@ -1386,24 +1385,25 @@ define amdgpu_kernel void @lds_ds_fmax_f64(ptr addrspace(5) %out, ptr addrspace(
; G_GFX11-LABEL: lds_ds_fmax_f64:
; G_GFX11: ; %bb.0:
; G_GFX11-NEXT: s_clause 0x1
-; G_GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; G_GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; G_GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; G_GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; G_GFX11-NEXT: s_mov_b32 s0, 0
+; G_GFX11-NEXT: s_mov_b32 s1, 0x40450000
+; G_GFX11-NEXT: v_mov_b32_e32 v0, s0
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: s_add_i32 s4, s2, 4
-; G_GFX11-NEXT: s_mov_b32 s2, 0
-; G_GFX11-NEXT: s_mov_b32 s3, 0x40450000
+; G_GFX11-NEXT: s_add_i32 s4, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v5, s3
; G_GFX11-NEXT: s_lshl_b32 s5, s4, 3
-; G_GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s1
-; G_GFX11-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v2, s5
-; G_GFX11-NEXT: s_lshl_b32 s2, s4, 4
-; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; G_GFX11-NEXT: v_mov_b32_e32 v4, s2
+; G_GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; G_GFX11-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s5
+; G_GFX11-NEXT: s_lshl_b32 s0, s4, 4
+; G_GFX11-NEXT: v_mov_b32_e32 v4, s0
; G_GFX11-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1]
; G_GFX11-NEXT: ds_max_f64 v4, v[0:1]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(1)
; G_GFX11-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3]
; G_GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s0
+; G_GFX11-NEXT: scratch_store_b64 off, v[0:1], s2
; G_GFX11-NEXT: s_endpgm
%idx.add = add nuw i32 %idx, 4
%shl0 = shl i32 %idx.add, 3
diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index e1124f3ba89b5..90623c0d0a522 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @load_zeroinit_lds_global(ptr addrspace(1) %out, i1 %p
; GCN: liveins: $sgpr0_sgpr1
; GCN: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX6: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0
- ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0
+ ; GFX8: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0
; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub1
; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
index 01a1ab41c5cac..2c3e3fa2a730a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -78,12 +78,12 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
;
; VI-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s4, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -111,12 +111,12 @@ define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(ptr addrspace(1) %out,
; GFX11-LABEL: s_cvt_pkrtz_samereg_v2f16_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s2, s2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, s4, s4
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -166,14 +166,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_cvt_pkrtz_v2f16_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -219,12 +219,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(ptr addrspace(1) %out, ptr addr
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -262,16 +262,16 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, v3, 1.0
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -279,35 +279,35 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e64 v1, v1, 1.0
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, 1.0
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -340,16 +340,16 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
;
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, 1.0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -357,35 +357,35 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_pkrtz_f16_f32_e32 v1, 1.0, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, 1.0, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -423,14 +423,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -476,12 +476,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -525,14 +525,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -578,12 +578,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -627,14 +627,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -680,12 +680,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(ptr addrspace(1) %ou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -v1, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -730,14 +730,14 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
; VI-LABEL: v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -783,12 +783,12 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(ptr addrsp
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e64 v1, -|v1|, -v2
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 224de9512c493..edd88daa88891 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -549,8 +549,8 @@ end:
; GCN-LABEL: {{^}}test_export_clustering:
; PREGFX11-DAG: v_mov_b32_e32 [[W0:v[0-9]+]], 0
; PREGFX11-DAG: v_mov_b32_e32 [[W1:v[0-9]+]], 1.0
-; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s0
-; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s1
+; PREGFX11-DAG: v_mov_b32_e32 [[X:v[0-9]+]], s2
+; PREGFX11-DAG: v_mov_b32_e32 [[Y:v[0-9]+]], s3
; PREGFX11-DAG: v_add_f32_e{{32|64}} [[Z0:v[0-9]+]]
; PREGFX11-DAG: v_sub_f32_e{{32|64}} [[Z1:v[0-9]+]]
; PREGFX11: exp param0 [[X]], [[Y]], [[Z0]], [[W0]]{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
index a737c5e7dd265..0567b422aa8b1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w32.ll
@@ -126,20 +126,20 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GISEL-GFX11-LABEL: v_fcmp_f32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f32:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
store i32 %result, ptr addrspace(1) %out
@@ -150,13 +150,13 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -176,14 +176,14 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_oeq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -208,13 +208,13 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -234,14 +234,14 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_one:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -266,13 +266,13 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -292,14 +292,14 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ogt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -324,13 +324,13 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -350,14 +350,14 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_oge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_le_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -382,13 +382,13 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -408,14 +408,14 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_olt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -440,13 +440,13 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -466,14 +466,14 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ole:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -498,13 +498,13 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_o:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -524,14 +524,14 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_o:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_o_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -556,13 +556,13 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_uo:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -582,14 +582,14 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_uo:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_u_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -614,13 +614,13 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -640,14 +640,14 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ueq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nlg_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -672,13 +672,13 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -698,14 +698,14 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_une:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -730,13 +730,13 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -756,14 +756,14 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nge_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -788,13 +788,13 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -814,14 +814,14 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_ngt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -846,13 +846,13 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -872,14 +872,14 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nle_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -904,13 +904,13 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; SDAG-GFX11-LABEL: v_fcmp_f32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; SDAG-GFX11-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -930,14 +930,14 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; GISEL-GFX11-LABEL: v_fcmp_f32_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s2, 0x42c80000, s2
+; GISEL-GFX11-NEXT: v_cmp_nlt_f32_e64 s0, 0x42c80000, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -961,47 +961,47 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oeq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_oeq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_oeq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_oeq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_eq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
store i32 %result, ptr addrspace(1) %out
@@ -1011,47 +1011,47 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_one:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_one:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_one:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_one:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
store i32 %result, ptr addrspace(1) %out
@@ -1061,47 +1061,47 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ogt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ogt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ogt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ogt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
store i32 %result, ptr addrspace(1) %out
@@ -1111,47 +1111,47 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_oge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_oge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_oge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_oge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
store i32 %result, ptr addrspace(1) %out
@@ -1161,47 +1161,47 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_olt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_olt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_olt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_olt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
store i32 %result, ptr addrspace(1) %out
@@ -1211,47 +1211,47 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ole:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ole:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ole:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ole:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
store i32 %result, ptr addrspace(1) %out
@@ -1261,47 +1261,47 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ueq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ueq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ueq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ueq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nlg_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
store i32 %result, ptr addrspace(1) %out
@@ -1311,47 +1311,47 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_o:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_o:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_o:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_o:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_o_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
store i32 %result, ptr addrspace(1) %out
@@ -1361,47 +1361,47 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_uo:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_uo:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_uo:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_uo:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_u_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
store i32 %result, ptr addrspace(1) %out
@@ -1411,47 +1411,47 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_une:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_une:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_une:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_une:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_neq_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
store i32 %result, ptr addrspace(1) %out
@@ -1461,47 +1461,47 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ugt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ugt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ugt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ugt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nge_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
store i32 %result, ptr addrspace(1) %out
@@ -1511,47 +1511,47 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_uge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_uge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_uge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_uge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ngt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
store i32 %result, ptr addrspace(1) %out
@@ -1561,47 +1561,47 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ult:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ult:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ult:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ult:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nle_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
store i32 %result, ptr addrspace(1) %out
@@ -1611,47 +1611,47 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; SDAG-GFX11-LABEL: v_fcmp_f64_ule:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_fcmp_f64_ule:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_fcmp_f64_ule:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f64_ule:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s2, 0x40590000, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_nlt_f64_e64 s0, 0x40590000, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
store i32 %result, ptr addrspace(1) %out
@@ -1663,14 +1663,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; SDAG-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3|
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0|
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1691,15 +1691,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GISEL-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, s2, |s3|
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, s4, |s0|
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1727,14 +1727,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; SDAG-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; SDAG-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3|
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0|
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1755,15 +1755,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GISEL-GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, |s2|, |s3|
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, |s4|, |s0|
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1798,20 +1798,20 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GISEL-GFX11-LABEL: v_fcmp_f16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_fcmp_f16:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
store i32 %result, ptr addrspace(1) %out
@@ -1823,13 +1823,13 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oeq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1849,14 +1849,14 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_oeq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1882,13 +1882,13 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_one:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1908,14 +1908,14 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_one:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1941,13 +1941,13 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ogt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1967,14 +1967,14 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ogt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2000,13 +2000,13 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_oge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2026,14 +2026,14 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_oge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_le_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2059,13 +2059,13 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_olt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2085,14 +2085,14 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_olt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2118,13 +2118,13 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ole:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2144,14 +2144,14 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ole:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2177,13 +2177,13 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ueq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2203,14 +2203,14 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ueq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nlg_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2236,13 +2236,13 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_une:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2262,14 +2262,14 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_une:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_neq_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2295,13 +2295,13 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2321,14 +2321,14 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nge_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2354,13 +2354,13 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2380,14 +2380,14 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_ngt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2413,13 +2413,13 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2439,14 +2439,14 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nle_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2471,13 +2471,13 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_o:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2497,14 +2497,14 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_o:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_o_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2529,13 +2529,13 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_uo:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2555,14 +2555,14 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_uo:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_u_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -2587,13 +2587,13 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; SDAG-GFX11-LABEL: v_fcmp_f16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2
+; SDAG-GFX11-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -2613,14 +2613,14 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; GISEL-GFX11-LABEL: v_fcmp_f16_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s2, 0x5640, s2
+; GISEL-GFX11-NEXT: v_cmp_nlt_f16_e64 s0, 0x5640, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
index 7d41cf1c5bcb8..62a007e5029be 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.w64.ll
@@ -137,10 +137,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GFX11-GISEL-LABEL: v_fcmp_f32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -151,10 +151,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; GFX9-GISEL-LABEL: v_fcmp_f32:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f32:
@@ -163,10 +163,10 @@ define amdgpu_kernel void @v_fcmp_f32(ptr addrspace(1) %out, float %src) {
;
; VI-GISEL-LABEL: v_fcmp_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
@@ -178,15 +178,15 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_eq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -206,29 +206,29 @@ define amdgpu_kernel void @v_fcmp_f32_oeq(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_eq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
@@ -240,15 +240,15 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_neq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -268,29 +268,29 @@ define amdgpu_kernel void @v_fcmp_f32_one(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
@@ -302,15 +302,15 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_lt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -330,29 +330,29 @@ define amdgpu_kernel void @v_fcmp_f32_ogt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_gt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
@@ -364,15 +364,15 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_le_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -392,29 +392,29 @@ define amdgpu_kernel void @v_fcmp_f32_oge(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ge_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
@@ -426,15 +426,15 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_gt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -454,29 +454,29 @@ define amdgpu_kernel void @v_fcmp_f32_olt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
@@ -488,15 +488,15 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_ge_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -516,29 +516,29 @@ define amdgpu_kernel void @v_fcmp_f32_ole(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_le_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
@@ -550,15 +550,15 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_o:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_o_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -578,29 +578,29 @@ define amdgpu_kernel void @v_fcmp_f32_o(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_o_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 7)
@@ -612,15 +612,15 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_uo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_u_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -640,29 +640,29 @@ define amdgpu_kernel void @v_fcmp_f32_uo(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_u_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 8)
@@ -674,15 +674,15 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -702,29 +702,29 @@ define amdgpu_kernel void @v_fcmp_f32_ueq(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
@@ -736,15 +736,15 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_neq_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -764,29 +764,29 @@ define amdgpu_kernel void @v_fcmp_f32_une(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
@@ -798,15 +798,15 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nge_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -826,29 +826,29 @@ define amdgpu_kernel void @v_fcmp_f32_ugt(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nle_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
@@ -860,15 +860,15 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_ngt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -888,29 +888,29 @@ define amdgpu_kernel void @v_fcmp_f32_uge(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
@@ -922,15 +922,15 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nle_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -950,29 +950,29 @@ define amdgpu_kernel void @v_fcmp_f32_ult(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nge_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
@@ -984,15 +984,15 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
; GFX11-LABEL: v_fcmp_f32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f32_e64 s[2:3], 0x42c80000, s2
+; GFX11-NEXT: v_cmp_nlt_f32_e64 s[0:1], 0x42c80000, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1012,29 +1012,29 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
;
; VI-SDAG-LABEL: v_fcmp_f32_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f32_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x42c80000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ngt_f32_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
@@ -1045,56 +1045,56 @@ define amdgpu_kernel void @v_fcmp_f32_ule(ptr addrspace(1) %out, float %src) {
define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oeq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_eq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_oeq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_eq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
@@ -1105,56 +1105,56 @@ define amdgpu_kernel void @v_fcmp_f64_oeq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_one:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_neq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_one:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
@@ -1165,56 +1165,56 @@ define amdgpu_kernel void @v_fcmp_f64_one(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ogt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_lt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ogt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_gt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
@@ -1225,56 +1225,56 @@ define amdgpu_kernel void @v_fcmp_f64_ogt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_oge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_le_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_oge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_ge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
@@ -1285,56 +1285,56 @@ define amdgpu_kernel void @v_fcmp_f64_oge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_olt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_gt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_olt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_lt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
@@ -1345,56 +1345,56 @@ define amdgpu_kernel void @v_fcmp_f64_olt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ole:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_ge_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ole:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_le_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
@@ -1405,56 +1405,56 @@ define amdgpu_kernel void @v_fcmp_f64_ole(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ueq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nlg_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ueq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nlg_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
@@ -1465,56 +1465,56 @@ define amdgpu_kernel void @v_fcmp_f64_ueq(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_o:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_o_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_o:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_o_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 7)
@@ -1525,56 +1525,56 @@ define amdgpu_kernel void @v_fcmp_f64_o(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_uo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_u_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_uo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_u_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 8)
@@ -1585,56 +1585,56 @@ define amdgpu_kernel void @v_fcmp_f64_uo(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_une:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_neq_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_une:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_neq_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
@@ -1645,56 +1645,56 @@ define amdgpu_kernel void @v_fcmp_f64_une(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ugt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nge_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ugt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nle_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
@@ -1705,56 +1705,56 @@ define amdgpu_kernel void @v_fcmp_f64_ugt(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_uge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_ngt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_uge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nlt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
@@ -1765,56 +1765,56 @@ define amdgpu_kernel void @v_fcmp_f64_uge(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ult:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nle_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ult:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_nge_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
@@ -1825,56 +1825,56 @@ define amdgpu_kernel void @v_fcmp_f64_ult(ptr addrspace(1) %out, double %src) {
define amdgpu_kernel void @v_fcmp_f64_ule(ptr addrspace(1) %out, double %src) {
; GFX11-LABEL: v_fcmp_f64_ule:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f64_e64 s[2:3], 0x40590000, s[2:3]
+; GFX11-NEXT: v_cmp_nlt_f64_e64 s[0:1], 0x40590000, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX9-LABEL: v_fcmp_f64_ule:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0x40590000
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f64_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f64_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40590000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f64_e64 s[2:3], s[2:3], v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_cmp_ngt_f64_e64 s[0:1], s[6:7], v[0:1]
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s5
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
@@ -1887,17 +1887,17 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
; GFX11-LABEL: v_fcmp_f16_oeq_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |s3|
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |s0|
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1918,31 +1918,31 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_with_fabs(ptr addrspace(1) %out, half
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq_with_fabs:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
+; VI-SDAG-NEXT: s_lshr_b32 s0, s4, 16
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0|
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq_with_fabs:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, |v0|
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, |v0|
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
@@ -1956,17 +1956,17 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
; GFX11-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_lshr_b32 s3, s2, 16
+; GFX11-NEXT: s_lshr_b32 s0, s4, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |s3|
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |s0|
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1987,31 +1987,31 @@ define amdgpu_kernel void @v_fcmp_f16_oeq_both_operands_with_fabs(ptr addrspace(
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_lshr_b32 s3, s2, 16
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s3
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
+; VI-SDAG-NEXT: s_lshr_b32 s0, s4, 16
; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0|
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq_both_operands_with_fabs:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_lshr_b32 s3, s2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s3
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], |s2|, |v0|
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: s_lshr_b32 s0, s4, 16
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], |s4|, |v0|
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%temp = call half @llvm.fabs.f16(half %a)
@@ -2028,10 +2028,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GFX11-GISEL-LABEL: v_fcmp_f16:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GFX11-GISEL-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -2042,10 +2042,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; GFX9-GISEL-LABEL: v_fcmp_f16:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
;
; VI-SDAG-LABEL: v_fcmp_f16:
@@ -2054,10 +2054,10 @@ define amdgpu_kernel void @v_fcmp_f16(ptr addrspace(1) %out, half %src) {
;
; VI-GISEL-LABEL: v_fcmp_f16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 -1)
@@ -2070,15 +2070,15 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oeq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_eq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2098,29 +2098,29 @@ define amdgpu_kernel void @v_fcmp_f16_oeq(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_oeq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oeq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_eq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 1)
@@ -2133,15 +2133,15 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_one:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_neq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2161,29 +2161,29 @@ define amdgpu_kernel void @v_fcmp_f16_one(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_one:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_one:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 6)
@@ -2196,15 +2196,15 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ogt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_lt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2224,29 +2224,29 @@ define amdgpu_kernel void @v_fcmp_f16_ogt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ogt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ogt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_gt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 2)
@@ -2259,15 +2259,15 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_oge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_le_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2287,29 +2287,29 @@ define amdgpu_kernel void @v_fcmp_f16_oge(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_oge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_oge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ge_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 3)
@@ -2322,15 +2322,15 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_olt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_gt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2350,29 +2350,29 @@ define amdgpu_kernel void @v_fcmp_f16_olt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_olt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_olt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_lt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 4)
@@ -2385,15 +2385,15 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ole:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_ge_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2413,29 +2413,29 @@ define amdgpu_kernel void @v_fcmp_f16_ole(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ole:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ole:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_le_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 5)
@@ -2448,15 +2448,15 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ueq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nlg_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2476,29 +2476,29 @@ define amdgpu_kernel void @v_fcmp_f16_ueq(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ueq:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ueq:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlg_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 9)
@@ -2511,15 +2511,15 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_une:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_neq_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_neq_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2539,29 +2539,29 @@ define amdgpu_kernel void @v_fcmp_f16_une(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_une:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_une:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_neq_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 14)
@@ -2574,15 +2574,15 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nge_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nge_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2602,29 +2602,29 @@ define amdgpu_kernel void @v_fcmp_f16_ugt(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ugt:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ugt:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nle_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 10)
@@ -2637,15 +2637,15 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ngt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_ngt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2665,29 +2665,29 @@ define amdgpu_kernel void @v_fcmp_f16_uge(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_uge:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_uge:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nlt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 11)
@@ -2700,15 +2700,15 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nle_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nle_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2728,29 +2728,29 @@ define amdgpu_kernel void @v_fcmp_f16_ult(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ult:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ult:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_nge_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 12)
@@ -2762,15 +2762,15 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_o:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_o_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_o_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2790,29 +2790,29 @@ define amdgpu_kernel void @v_fcmp_f16_o(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_o:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_o:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_o_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 7)
@@ -2824,15 +2824,15 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_uo:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_u_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_u_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2852,29 +2852,29 @@ define amdgpu_kernel void @v_fcmp_f16_uo(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_uo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_uo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_u_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 8)
@@ -2886,15 +2886,15 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
; GFX11-LABEL: v_fcmp_f16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlt_f16_e64 s[2:3], 0x5640, s2
+; GFX11-NEXT: v_cmp_nlt_f16_e64 s[0:1], 0x5640, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2914,29 +2914,29 @@ define amdgpu_kernel void @v_fcmp_f16_ule(ptr addrspace(1) %out, half %src) {
;
; VI-SDAG-LABEL: v_fcmp_f16_ule:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s3
+; VI-SDAG-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s0
; VI-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: v_fcmp_f16_ule:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x5640
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[2:3], s2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VI-GISEL-NEXT: v_cmp_ngt_f16_e64 s[0:1], s4, v0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s3
; VI-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-GISEL-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.fcmp.f16(half %src, half 100.00, i32 13)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
index ca06a57be19cc..528d289e1848f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.bf16.bf16.ll
@@ -8,15 +8,15 @@ declare bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> %a, <2 x bfloat> %b, bf
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: global_load_u16 v1, v0, s[10:11]
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dot2_bf16_bf16 v1, s2, s3, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dot2_bf16_bf16 v1, s0, s1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -34,18 +34,17 @@ entry:
}
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_bf16_bf16_dpp(
-; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
-; SDAG-GFX11: ; %bb.0: ; %entry
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2
-; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3
-; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1
-; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0)
-; SDAG-GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0
-; SDAG-GFX11-NEXT: s_endpgm
-;
+; GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: scratch_load_b32 v0, off, s6
+; GFX11-NEXT: scratch_load_u16 v1, off, s7
+; GFX11-NEXT: scratch_load_b32 v2, off, s5
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_dot2_bf16_bf16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX11-NEXT: scratch_store_b16 off, v0, s4
+; GFX11-NEXT: s_endpgm
; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_bf16_bf16_dpp:
; GISEL-GFX11: ; %bb.0: ; %entry
; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
@@ -95,3 +94,5 @@ entry:
}
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; SDAG-GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
index 99c3deaada8c6..7edf3d6c03690 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f16.f16.ll
@@ -7,15 +7,15 @@ declare half @llvm.amdgcn.fdot2.f16.f16(<2 x half> %a, <2 x half> %b, half %c)
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: global_load_u16 v1, v0, s[10:11]
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dot2_f16_f16 v1, s2, s3, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: v_dot2_f16_f16 v1, s0, s1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -35,26 +35,26 @@ entry:
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f16_f16_dpp(
; SDAG-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
; SDAG-GFX11: ; %bb.0: ; %entry
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s2
-; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s3
-; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s1
+; SDAG-GFX11-NEXT: scratch_load_b32 v0, off, s6
+; SDAG-GFX11-NEXT: scratch_load_u16 v1, off, s7
+; SDAG-GFX11-NEXT: scratch_load_b32 v2, off, s5
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0)
; SDAG-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v2, v0, v1 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s0
+; SDAG-GFX11-NEXT: scratch_store_b16 off, v0, s4
; SDAG-GFX11-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: test_llvm_amdgcn_fdot2_f16_f16_dpp:
; GISEL-GFX11: ; %bb.0: ; %entry
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s1
-; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s2
-; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s3
+; GISEL-GFX11-NEXT: scratch_load_b32 v0, off, s5
+; GISEL-GFX11-NEXT: scratch_load_b32 v1, off, s6
+; GISEL-GFX11-NEXT: scratch_load_u16 v2, off, s7
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX11-NEXT: v_dot2_f16_f16_e64_dpp v0, v0, v1, v2 quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
-; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s0
+; GISEL-GFX11-NEXT: scratch_store_b16 off, v0, s4
; GISEL-GFX11-NEXT: s_endpgm
ptr addrspace(5) %r,
ptr addrspace(5) %a,
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index e51b1d2da2e41..40c6925c6fa3f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -7,16 +7,16 @@ declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, floa
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[10:11], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dot2_f32_bf16 v0, s1, s2, v0 clamp
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -37,16 +37,16 @@ entry:
define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[10:11], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s2, s[8:9], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dot2_f32_bf16 v0, s1, s2, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
index 434fa1bf7b340..690362ce6d0bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.ordered.add.b64.ll
@@ -7,20 +7,20 @@ declare i64 @llvm.amdgcn.global.atomic.ordered.add.b64(ptr addrspace(1), i64)
define amdgpu_kernel void @global_atomic_ordered_add_b64_no_rtn(ptr addrspace(1) %addr, i64 %in) {
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_no_rtn:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
+; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:-32 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_ordered_add_b64_no_rtn:
; GFX12-GISEL: ; %bb.0: ; %entry
-; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[0:1] offset:-32 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
+; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:-32 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 -4
@@ -31,14 +31,15 @@ entry:
define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %addr, i64 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_ordered_add_b64_rtn:
; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v0, s6
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s7
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s6
; GFX12-SDAG-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -47,13 +48,13 @@ define amdgpu_kernel void @global_atomic_ordered_add_b64_rtn(ptr addrspace(1) %a
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
; GFX12-GISEL-NEXT: global_atomic_ordered_add_b64 v[0:1], v2, v[0:1], s[4:5] offset:32 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index f6197e0770213..c2eb77177f845 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -8,12 +8,12 @@ declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -27,12 +27,12 @@ entry:
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index a2dc3662fcc48..96835c68b2d29 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -8,12 +8,12 @@ declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b64 v1, v0, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT: global_store_b32 v0, v1, s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -27,12 +27,12 @@ entry:
define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
; GFX12-LABEL: global_load_tr_b128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[4:5] offset:32
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
index ae61b58e95ac7..1e1ea1057005d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll
@@ -22,13 +22,13 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -48,14 +48,14 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_eq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -87,20 +87,20 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i32:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
store i32 %result, ptr addrspace(1) %out
@@ -111,13 +111,13 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -137,14 +137,14 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ne:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -169,13 +169,13 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -195,14 +195,14 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -227,13 +227,13 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -253,14 +253,14 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -285,13 +285,13 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -311,14 +311,14 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -343,13 +343,13 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -369,14 +369,14 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_u32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -401,13 +401,13 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i32_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -427,14 +427,14 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GISEL-GFX11-LABEL: v_icmp_i32_sgt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -459,13 +459,13 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -485,14 +485,14 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_sge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -517,13 +517,13 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -543,14 +543,14 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_slt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -575,13 +575,13 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; SDAG-GFX11-LABEL: v_icmp_i32_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -601,14 +601,14 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GISEL-GFX11-LABEL: v_icmp_i32_sle:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_i32_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -632,47 +632,47 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_eq:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_eq:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_eq:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_eq:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_eq_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
store i32 %result, ptr addrspace(1) %out
@@ -682,47 +682,47 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_ne:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_ne:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_ne:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_ne:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ne_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
store i32 %result, ptr addrspace(1) %out
@@ -732,47 +732,47 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ugt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ugt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ugt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ugt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
store i32 %result, ptr addrspace(1) %out
@@ -782,47 +782,47 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_uge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_uge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_uge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_uge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
store i32 %result, ptr addrspace(1) %out
@@ -832,47 +832,47 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ult:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ult:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ult:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ult:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
store i32 %result, ptr addrspace(1) %out
@@ -882,47 +882,47 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_u64_ule:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_u64_ule:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_u64_ule:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_u64_ule:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_u64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
store i32 %result, ptr addrspace(1) %out
@@ -932,47 +932,47 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sgt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sgt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sgt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sgt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
store i32 %result, ptr addrspace(1) %out
@@ -982,47 +982,47 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sge:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sge:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sge:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sge:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_le_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
store i32 %result, ptr addrspace(1) %out
@@ -1032,47 +1032,47 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_slt:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_slt:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_slt:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_slt:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
store i32 %result, ptr addrspace(1) %out
@@ -1082,47 +1082,47 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; SDAG-GFX11-LABEL: v_icmp_i64_sle:
; SDAG-GFX11: ; %bb.0:
-; SDAG-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; SDAG-GFX11-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; SDAG-GFX10-LABEL: v_icmp_i64_sle:
; SDAG-GFX10: ; %bb.0:
-; SDAG-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-GFX10-NEXT: v_mov_b32_e32 v0, 0
; SDAG-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s2
-; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; SDAG-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, s0
+; SDAG-GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; SDAG-GFX10-NEXT: s_endpgm
;
; GISEL-GFX11-LABEL: v_icmp_i64_sle:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
+; GISEL-GFX11-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i64_sle:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s2, 0x64, s[2:3]
-; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GISEL-GFX10-NEXT: v_cmp_ge_i64_e64 s0, 0x64, s[6:7]
+; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
store i32 %result, ptr addrspace(1) %out
@@ -1133,13 +1133,13 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_eq:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1159,14 +1159,14 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_eq:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_eq_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1198,20 +1198,20 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[0:1]
+; GISEL-GFX11-NEXT: global_store_b32 v0, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-GFX10-LABEL: v_icmp_i16:
; GISEL-GFX10: ; %bb.0:
-; GISEL-GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX10-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[0:1]
+; GISEL-GFX10-NEXT: global_store_dword v0, v0, s[2:3]
; GISEL-GFX10-NEXT: s_endpgm
%result = call i32 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
store i32 %result, ptr addrspace(1) %out
@@ -1222,13 +1222,13 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ne:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1248,14 +1248,14 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ne:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ne_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1280,13 +1280,13 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ugt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1306,14 +1306,14 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ugt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1338,13 +1338,13 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_uge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1364,14 +1364,14 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_uge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1396,13 +1396,13 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ult:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1422,14 +1422,14 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ult:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1454,13 +1454,13 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_ule:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1480,14 +1480,14 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_ule:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_u16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1512,13 +1512,13 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; SDAG-GFX11-LABEL: v_icmp_i16_sgt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1538,14 +1538,14 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GISEL-GFX11-LABEL: v_icmp_i16_sgt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_lt_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1570,13 +1570,13 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sge:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1596,14 +1596,14 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_sge:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_le_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1628,13 +1628,13 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_slt:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1654,14 +1654,14 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_slt:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_gt_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
@@ -1686,13 +1686,13 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; SDAG-GFX11-LABEL: v_icmp_i16_sle:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2
+; SDAG-GFX11-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; SDAG-GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
@@ -1712,14 +1712,14 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GISEL-GFX11-LABEL: v_icmp_i16_sle:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s2, 0x64, s2
+; GISEL-GFX11-NEXT: v_cmp_ge_i16_e64 s0, 0x64, s4
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
index 54931ac345130..ae285c8993b42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll
@@ -25,30 +25,30 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -67,15 +67,15 @@ define amdgpu_kernel void @v_icmp_i32_eq(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
@@ -98,29 +98,29 @@ define amdgpu_kernel void @v_icmp_i32(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i32:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i32:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GISEL-VI-NEXT: s_endpgm
;
; GISEL-GFX9-LABEL: v_icmp_i32:
; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
store i64 %result, ptr addrspace(1) %out
@@ -131,30 +131,30 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ne_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -173,15 +173,15 @@ define amdgpu_kernel void @v_icmp_i32_ne(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
@@ -193,30 +193,30 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -235,15 +235,15 @@ define amdgpu_kernel void @v_icmp_i32_ugt(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
@@ -255,30 +255,30 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -297,15 +297,15 @@ define amdgpu_kernel void @v_icmp_i32_uge(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
@@ -317,30 +317,30 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -359,15 +359,15 @@ define amdgpu_kernel void @v_icmp_i32_ult(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
@@ -379,30 +379,30 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_u32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -421,15 +421,15 @@ define amdgpu_kernel void @v_icmp_i32_ule(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
@@ -441,30 +441,30 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
; GFX11-LABEL: v_icmp_i32_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -483,15 +483,15 @@ define amdgpu_kernel void @v_icmp_i32_sgt(ptr addrspace(1) %out, i32 %src) #1 {
;
; GISEL-VI-LABEL: v_icmp_i32_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
@@ -503,30 +503,30 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -545,15 +545,15 @@ define amdgpu_kernel void @v_icmp_i32_sge(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
@@ -565,30 +565,30 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -607,15 +607,15 @@ define amdgpu_kernel void @v_icmp_i32_slt(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
@@ -627,30 +627,30 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
; GFX11-LABEL: v_icmp_i32_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i32_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_i32_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i32_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -669,15 +669,15 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
;
; GISEL-VI-LABEL: v_icmp_i32_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i32_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
@@ -688,56 +688,56 @@ define amdgpu_kernel void @v_icmp_i32_sle(ptr addrspace(1) %out, i32 %src) {
define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_eq:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_eq:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
@@ -748,56 +748,56 @@ define amdgpu_kernel void @v_icmp_i64_eq(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_ne:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ne_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_ne:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
@@ -808,56 +808,56 @@ define amdgpu_kernel void @v_icmp_i64_ne(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ugt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_lt_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ugt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
@@ -868,56 +868,56 @@ define amdgpu_kernel void @v_icmp_u64_ugt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_uge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_le_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_uge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
@@ -928,56 +928,56 @@ define amdgpu_kernel void @v_icmp_u64_uge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ult:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_gt_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ult:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
@@ -988,56 +988,56 @@ define amdgpu_kernel void @v_icmp_u64_ult(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_u64_ule:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ge_u64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_u64_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_u64_ule:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_u64_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
@@ -1048,56 +1048,56 @@ define amdgpu_kernel void @v_icmp_u64_ule(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sgt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_lt_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sgt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
@@ -1108,56 +1108,56 @@ define amdgpu_kernel void @v_icmp_i64_sgt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sge:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_le_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sge:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
@@ -1168,56 +1168,56 @@ define amdgpu_kernel void @v_icmp_i64_sge(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_slt:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_gt_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_slt:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
@@ -1228,56 +1228,56 @@ define amdgpu_kernel void @v_icmp_i64_slt(ptr addrspace(1) %out, i64 %src) {
define amdgpu_kernel void @v_icmp_i64_sle(ptr addrspace(1) %out, i64 %src) {
; GFX11-LABEL: v_icmp_i64_sle:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i64_e64 s[2:3], 0x64, s[2:3]
+; GFX11-NEXT: v_cmp_ge_i64_e64 s[0:1], 0x64, s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i64_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s4
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s5
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
; SDAG-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; SDAG-VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_icmp_i64_sle:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x64
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i64_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i64_e64 s[2:3], s[2:3], v[0:1]
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i64_e64 s[0:1], s[6:7], v[0:1]
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s4
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s5
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
@@ -1289,30 +1289,30 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_eq:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_eq_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_eq:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1331,15 +1331,15 @@ define amdgpu_kernel void @v_icmp_i16_eq(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_eq:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_eq_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 32)
@@ -1362,29 +1362,29 @@ define amdgpu_kernel void @v_icmp_i16(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-GFX11-LABEL: v_icmp_i16:
; GISEL-GFX11: ; %bb.0:
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[0:1]
+; GISEL-GFX11-NEXT: global_store_b64 v0, v[0:1], s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: v_icmp_i16:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GISEL-VI-NEXT: s_endpgm
;
; GISEL-GFX9-LABEL: v_icmp_i16:
; GISEL-GFX9: ; %bb.0:
-; GISEL-GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-GFX9-NEXT: v_mov_b32_e32 v0, 0
; GISEL-GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[0:1]
+; GISEL-GFX9-NEXT: global_store_dwordx2 v0, v[0:1], s[2:3]
; GISEL-GFX9-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 30)
store i64 %result, ptr addrspace(1) %out
@@ -1395,30 +1395,30 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ne:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ne_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ne_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ne:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1437,15 +1437,15 @@ define amdgpu_kernel void @v_icmp_i16_ne(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ne:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ne_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 33)
@@ -1457,30 +1457,30 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ugt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ugt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1499,15 +1499,15 @@ define amdgpu_kernel void @v_icmp_i16_ugt(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ugt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 34)
@@ -1519,30 +1519,30 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_uge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_uge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1561,15 +1561,15 @@ define amdgpu_kernel void @v_icmp_i16_uge(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_uge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 35)
@@ -1581,30 +1581,30 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ult:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ult:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1623,15 +1623,15 @@ define amdgpu_kernel void @v_icmp_i16_ult(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ult:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 36)
@@ -1643,30 +1643,30 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_ule:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_u16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_u16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_ule:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1685,15 +1685,15 @@ define amdgpu_kernel void @v_icmp_i16_ule(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_ule:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_u16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 37)
@@ -1705,30 +1705,30 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
; GFX11-LABEL: v_icmp_i16_sgt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_lt_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_lt_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sgt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1747,15 +1747,15 @@ define amdgpu_kernel void @v_icmp_i16_sgt(ptr addrspace(1) %out, i16 %src) #1 {
;
; GISEL-VI-LABEL: v_icmp_i16_sgt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_gt_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 38)
@@ -1767,30 +1767,30 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sge:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_le_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_le_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sge:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1809,15 +1809,15 @@ define amdgpu_kernel void @v_icmp_i16_sge(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_sge:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_ge_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 39)
@@ -1829,30 +1829,30 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_slt:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_gt_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_gt_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_slt:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1871,15 +1871,15 @@ define amdgpu_kernel void @v_icmp_i16_slt(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_slt:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_lt_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 40)
@@ -1891,30 +1891,30 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
; GFX11-LABEL: v_icmp_i16_sle:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_ge_i16_e64 s[2:3], 0x64, s2
+; GFX11-NEXT: v_cmp_ge_i16_e64 s[0:1], 0x64, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: v_mov_b32_e32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; SDAG-VI-LABEL: v_icmp_i16_sle:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0x64
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v2, s2
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
-; SDAG-VI-NEXT: v_mov_b32_e32 v3, s3
+; SDAG-VI-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, s0
; SDAG-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; SDAG-VI-NEXT: s_endpgm
;
@@ -1933,15 +1933,15 @@ define amdgpu_kernel void @v_icmp_i16_sle(ptr addrspace(1) %out, i16 %src) {
;
; GISEL-VI-LABEL: v_icmp_i16_sle:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GISEL-VI-NEXT: v_mov_b32_e32 v0, 0x64
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[2:3], s2, v0
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v3, s1
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_cmp_le_i16_e64 s[0:1], s4, v0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: v_mov_b32_e32 v3, s3
; GISEL-VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GISEL-VI-NEXT: s_endpgm
%result = call i64 @llvm.amdgcn.icmp.i16(i16 %src, i16 100, i32 41)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
index 3a77b3bc9cb95..cffd9a6eee25c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll
@@ -349,9 +349,10 @@ main_body:
define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
-; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
@@ -362,22 +363,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1013-NEXT: v_mov_b32_e32 v10, 0x40e00000
; GFX1013-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v11, 0x41000000
; GFX1030-NEXT: v_mov_b32_e32 v10, 0x40e00000
@@ -388,36 +390,37 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_nsa_reassign(ptr %p_ray, <4
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX1030-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c7
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[0:3]
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:11], s[4:7]
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000
; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0
; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0
; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7
; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-NEXT: flat_load_b32 v11, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x40c00000
; GFX11-NEXT: v_mov_b32_e32 v1, 0x40e00000
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[0:3]
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[6:8], v[3:5], v[0:2]], s[4:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
@@ -442,9 +445,10 @@ main_body:
define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray, <4 x i32> inreg %tdescr) {
; GFX1013-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1013: ; %bb.0: ; %main_body
-; GFX1013-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1013-NEXT: s_clause 0x1
+; GFX1013-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1013-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1013-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1013-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1013-NEXT: v_mov_b32_e32 v3, 0
; GFX1013-NEXT: v_mov_b32_e32 v4, 1.0
; GFX1013-NEXT: v_mov_b32_e32 v5, 2.0
@@ -452,22 +456,23 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1013-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1013-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1013-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1013-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s4, s5, 0, s4
+; GFX1013-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1013-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
; GFX1013-NEXT: flat_load_dword v2, v[0:1]
; GFX1013-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1013-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1013-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1013-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1013-NEXT: s_waitcnt vmcnt(0)
; GFX1013-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1013-NEXT: s_endpgm
;
; GFX1030-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX1030: ; %bb.0: ; %main_body
-; GFX1030-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX1030-NEXT: s_clause 0x1
+; GFX1030-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1030-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; GFX1030-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1030-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34
; GFX1030-NEXT: v_mov_b32_e32 v3, 0
; GFX1030-NEXT: v_mov_b32_e32 v5, 2.0
; GFX1030-NEXT: v_mov_b32_e32 v4, 1.0
@@ -475,34 +480,35 @@ define amdgpu_kernel void @image_bvh64_intersect_ray_a16_nsa_reassign(ptr %p_ray
; GFX1030-NEXT: v_mov_b32_e32 v7, 0x46004500
; GFX1030-NEXT: v_mov_b32_e32 v8, 0x48004700
; GFX1030-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1030-NEXT: v_add_co_u32 v0, s4, s4, v0
-; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX1030-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX1030-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX1030-NEXT: flat_load_dword v2, v[0:1]
; GFX1030-NEXT: v_mov_b32_e32 v1, 0x102
; GFX1030-NEXT: v_mov_b32_e32 v0, 0xb36211c6
; GFX1030-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
+; GFX1030-NEXT: image_bvh64_intersect_ray v[0:3], v[0:8], s[4:7] a16
; GFX1030-NEXT: s_waitcnt vmcnt(0)
; GFX1030-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; GFX1030-NEXT: s_endpgm
;
; GFX11-LABEL: image_bvh64_intersect_ray_a16_nsa_reassign:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34
; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0
; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102
; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
; GFX11-NEXT: flat_load_b32 v8, v[0:1]
; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200
; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[0:3] a16
+; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[3:5], v[0:2]], s[4:7] a16
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b128 v[0:1], v[0:3]
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
index 634159aec9db5..c9bdc70da1bd3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll
@@ -72,13 +72,13 @@ define amdgpu_kernel void @v_permlane16_b32_vii(ptr addrspace(1) %out, i32 %src0
; GFX11-LABEL: v_permlane16_b32_vii:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane16_b32 v0, v0, 1, 2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -117,14 +117,14 @@ define amdgpu_kernel void @v_permlane16_b32_vll(ptr addrspace(1) %out, i32 %src0
; GFX11-LABEL: v_permlane16_b32_vll:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_movk_i32 s0, 0x1234
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlane16_b32 v0, v0, s0, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -165,19 +165,19 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
; GFX11-SDAG-LABEL: v_permlane16_b32_vvv:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -185,18 +185,18 @@ define amdgpu_kernel void @v_permlane16_b32_vvv(ptr addrspace(1) %out, i32 %src0
; GFX11-GISEL-LABEL: v_permlane16_b32_vvv:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -596,13 +596,13 @@ define amdgpu_kernel void @v_permlanex16_b32_vii(ptr addrspace(1) %out, i32 %src
; GFX11-LABEL: v_permlanex16_b32_vii:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, 1, 2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -641,14 +641,14 @@ define amdgpu_kernel void @v_permlanex16_b32_vll(ptr addrspace(1) %out, i32 %src
; GFX11-LABEL: v_permlanex16_b32_vll:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_movk_i32 s0, 0x1234
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: s_movk_i32 s2, 0x1234
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_permlanex16_b32 v0, v0, s0, 0xc1d1
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -689,19 +689,19 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
; GFX11-SDAG-LABEL: v_permlanex16_b32_vvv:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s3, v1
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX11-SDAG-NEXT: v_readfirstlane_b32 s2, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_readfirstlane_b32 s0, v1
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -709,18 +709,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv(ptr addrspace(1) %out, i32 %src
; GFX11-GISEL-LABEL: v_permlanex16_b32_vvv:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0
; GFX11-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-GISEL-NEXT: v_readfirstlane_b32 s3, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-GISEL-NEXT: v_readfirstlane_b32 s0, v1
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1075,11 +1075,11 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1088,11 +1088,11 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid(ptr addrspace(1) %out, i32 %
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1118,11 +1118,11 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1131,11 +1131,11 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1175,12 +1175,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -1189,13 +1189,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1204,12 +1204,12 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -1218,13 +1218,13 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid(ptr addrspace(1) %out, i32 %sr
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v0, s2, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -1250,11 +1250,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1263,11 +1263,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1294,11 +1294,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1307,11 +1307,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1338,11 +1338,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1351,11 +1351,11 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc(ptr addrspace(1) %out, i
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1382,11 +1382,11 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1395,11 +1395,11 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1425,11 +1425,11 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1438,11 +1438,11 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid(ptr addrspace(1) %out, i3
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_clause 0x1
; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
@@ -1496,13 +1496,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_clause 0x1
; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1511,12 +1511,12 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_b32 v1, v0, s2, s3
-; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v1, s[4:5]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -1525,13 +1525,13 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid(ptr addrspace(1) %out, i32 %s
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v0, s2, s3
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -1557,11 +1557,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1570,11 +1570,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,0]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1601,11 +1601,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1614,11 +1614,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc(ptr addrspace(1) %out, i32
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[0,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1645,11 +1645,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1658,11 +1658,11 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc(ptr addrspace(1) %out,
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x30
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, s3 op_sel:[1,1]
-; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
index 77a975f7abe0f..2cc49c34a628a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.var.ll
@@ -445,13 +445,13 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
; GFX12-SDAG-LABEL: v_permlane16var_b32_tid_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -459,14 +459,14 @@ define amdgpu_kernel void @v_permlane16var_b32_tid_tid(ptr addrspace(1) %out, i3
; GFX12-GISEL-LABEL: v_permlane16var_b32_tid_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -480,13 +480,13 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlane16var_b32_undef_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -494,14 +494,14 @@ define amdgpu_kernel void @v_permlane16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlane16var_b32_undef_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -516,14 +516,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v1, v0, v2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -531,14 +531,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v1, v0, v2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -552,13 +552,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -566,14 +566,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,0]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -588,13 +588,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -602,14 +602,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_bc(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[0,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -624,13 +624,13 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
; GFX12-SDAG-LABEL: v_permlane16var_b32_i_tid_fi_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -638,14 +638,14 @@ define amdgpu_kernel void @v_permlane16var_b32_i_tid_fi_bc(ptr addrspace(1) %out
; GFX12-GISEL-LABEL: v_permlane16var_b32_i_tid_fi_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlane16_var_b32 v0, v0, v1 op_sel:[1,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -660,13 +660,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
; GFX12-SDAG-LABEL: v_permlanex16var_b32_tid_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -674,14 +674,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_tid_tid(ptr addrspace(1) %out, i
; GFX12-GISEL-LABEL: v_permlanex16var_b32_tid_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -695,13 +695,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_undef_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -709,14 +709,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_undef_tid(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_undef_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -731,14 +731,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0x3039
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v2, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v1, v0, v2
-; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -746,14 +746,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid(ptr addrspace(1) %out, i32
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v1, v0, v2
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -767,13 +767,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -781,14 +781,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,0]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -803,13 +803,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -817,14 +817,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_bc(ptr addrspace(1) %out,
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[0,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
@@ -839,13 +839,13 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
; GFX12-SDAG-LABEL: v_permlanex16var_b32_i_tid_fi_bc:
; GFX12-SDAG: ; %bb.0:
; GFX12-SDAG-NEXT: s_clause 0x1
-; GFX12-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s4
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-SDAG-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
-; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
@@ -853,14 +853,14 @@ define amdgpu_kernel void @v_permlanex16var_b32_i_tid_fi_bc(ptr addrspace(1) %ou
; GFX12-GISEL-LABEL: v_permlanex16var_b32_i_tid_fi_bc:
; GFX12-GISEL: ; %bb.0:
; GFX12-GISEL-NEXT: s_clause 0x1
-; GFX12-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x30
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b32 s4, s[0:1], 0x30
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s4
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_permlanex16_var_b32 v0, v0, v1 op_sel:[1,1]
; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
index b81cb97725648..84edbb8ab3fc1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll
@@ -9,13 +9,13 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
; GFX11-LABEL: test_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -27,12 +27,12 @@ define amdgpu_kernel void @test_s(ptr addrspace(1) %out, i32 %src0) {
define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
; GFX11-LABEL: test_i:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_permlane64_b32 v0, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -44,22 +44,22 @@ define amdgpu_kernel void @test_i(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_v(ptr addrspace(1) %out, i32 %src0) #1 {
; GFX11-SDAG-LABEL: test_v:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_v:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
index cb511c93f67ed..bf3d0a5cf5215 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.store.d16.ll
@@ -8,11 +8,11 @@
define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
@@ -37,11 +37,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(ptr addrspace(8) %rsrc, half %dat
; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -53,14 +53,14 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -85,11 +85,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(ptr addrspace(8) %rsrc, <2 x hal
; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -101,16 +101,16 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s1, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
@@ -139,13 +139,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(ptr addrspace(8) %rsrc, <4 x ha
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -158,18 +158,18 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s3, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s3, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s3
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -196,12 +196,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(ptr addrspace(8) %rsrc, <4 x h
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
index 01df7634f0e9c..2be7ec2299a22 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.store.d16.ll
@@ -10,11 +10,11 @@
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_x:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_x v0, off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_x:
@@ -39,11 +39,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; GFX11-PACKED-LABEL: tbuffer_store_d16_x:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -51,11 +51,11 @@ define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data) {
; GFX12-PACKED-LABEL: tbuffer_store_d16_x:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_x v0, off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
@@ -67,14 +67,14 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xy:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dword s4, s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dword s2, s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s5, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xy v[0:1], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xy:
@@ -99,11 +99,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
; GFX11-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -111,11 +111,11 @@ define amdgpu_kernel void @tbuffer_store_d16_xy(<4 x i32> %rsrc, <2 x half> %dat
; GFX12-PACKED-LABEL: tbuffer_store_d16_xy:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b32 s4, s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b32 s2, s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xy v0, off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
@@ -127,16 +127,16 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyz:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s6
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s1, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyz v[0:2], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyz:
@@ -165,13 +165,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyz:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -179,13 +179,13 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-SDAG-LABEL: tbuffer_store_d16_xyz:
; GFX12-PACKED-SDAG: ; %bb.0: ; %main_body
; GFX12-PACKED-SDAG-NEXT: s_clause 0x1
-; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-SDAG-NEXT: s_and_b32 s5, s5, 0xffff
-; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-SDAG-NEXT: s_and_b32 s0, s3, 0xffff
+; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-SDAG-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-PACKED-SDAG-NEXT: tbuffer_store_d16_format_xyz v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-SDAG-NEXT: s_nop 0
; GFX12-PACKED-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-SDAG-NEXT: s_endpgm
@@ -193,14 +193,14 @@ define amdgpu_kernel void @tbuffer_store_d16_xyz(<4 x i32> %rsrc, <4 x half> %da
; GFX12-PACKED-GISEL-LABEL: tbuffer_store_d16_xyz:
; GFX12-PACKED-GISEL: ; %bb.0: ; %main_body
; GFX12-PACKED-GISEL-NEXT: s_clause 0x1
-; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s4, s4, s4
+; GFX12-PACKED-GISEL-NEXT: s_pack_lh_b32_b16 s2, s2, s2
; GFX12-PACKED-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-PACKED-GISEL-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-GISEL-NEXT: s_nop 0
; GFX12-PACKED-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-GISEL-NEXT: s_endpgm
@@ -213,18 +213,18 @@ main_body:
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data) {
; PREGFX10-UNPACKED-LABEL: tbuffer_store_d16_xyzw:
; PREGFX10-UNPACKED: ; %bb.0: ; %main_body
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
+; PREGFX10-UNPACKED-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; PREGFX10-UNPACKED-NEXT: s_waitcnt lgkmcnt(0)
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s6, s5, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s5, s5, 0xffff
-; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s7, s4, 16
-; PREGFX10-UNPACKED-NEXT: s_and_b32 s4, s4, 0xffff
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s4
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s7
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s5
-; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s6
-; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[0:3], 0 format:[BUF_NUM_FORMAT_USCALED]
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s0, s3, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s1, s3, 0xffff
+; PREGFX10-UNPACKED-NEXT: s_lshr_b32 s3, s2, 16
+; PREGFX10-UNPACKED-NEXT: s_and_b32 s2, s2, 0xffff
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v0, s2
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v1, s3
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v2, s1
+; PREGFX10-UNPACKED-NEXT: v_mov_b32_e32 v3, s0
+; PREGFX10-UNPACKED-NEXT: tbuffer_store_format_d16_xyzw v[0:3], off, s[4:7], 0 format:[BUF_NUM_FORMAT_USCALED]
; PREGFX10-UNPACKED-NEXT: s_endpgm
;
; PREGFX10-PACKED-LABEL: tbuffer_store_d16_xyzw:
@@ -251,12 +251,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
; GFX11-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX11-PACKED: ; %bb.0: ; %main_body
; GFX11-PACKED-NEXT: s_clause 0x1
-; GFX11-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-PACKED-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], 0 format:[BUF_FMT_10_10_10_2_SNORM]
; GFX11-PACKED-NEXT: s_nop 0
; GFX11-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-PACKED-NEXT: s_endpgm
@@ -264,12 +264,12 @@ define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %d
; GFX12-PACKED-LABEL: tbuffer_store_d16_xyzw:
; GFX12-PACKED: ; %bb.0: ; %main_body
; GFX12-PACKED-NEXT: s_clause 0x1
-; GFX12-PACKED-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX12-PACKED-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-PACKED-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-PACKED-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-PACKED-NEXT: s_wait_kmcnt 0x0
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s5
-; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[0:3], null format:[BUF_FMT_10_10_10_2_SNORM]
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-PACKED-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-PACKED-NEXT: tbuffer_store_d16_format_xyzw v[0:1], off, s[4:7], null format:[BUF_FMT_10_10_10_2_SNORM]
; GFX12-PACKED-NEXT: s_nop 0
; GFX12-PACKED-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-PACKED-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
index f52461b6b3807..2dc346a5717dd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
;
; GFX8GISEL-LABEL: uniform_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -83,12 +83,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-LABEL: uniform_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -96,12 +96,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-LABEL: uniform_value:
; GFX1164GISEL: ; %bb.0: ; %entry
; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -109,11 +109,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-LABEL: uniform_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -121,11 +121,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-LABEL: uniform_value:
; GFX1132GISEL: ; %bb.0: ; %entry
; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -138,98 +138,98 @@ entry:
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: const_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: const_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: const_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX10DAGISEL-LABEL: const_value:
; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: const_value:
; GFX10GISEL: ; %bb.0: ; %entry
-; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: const_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: const_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: const_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: const_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -280,241 +280,241 @@ entry:
define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-LABEL: divergent_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: divergent_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, 0
; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: divergent_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, 0
; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_value:
; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_value:
; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, 0
; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_value:
; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032DAGISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_value:
; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, 0
; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032GISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, 0
; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164DAGISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, 0
; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164GISEL-NEXT: s_max_u32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: divergent_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, 0
; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132DAGISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: divergent_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, 0
; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_max_u32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132GISEL-NEXT: s_max_u32 s0, s0, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -556,10 +556,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
; GFX8DAGISEL-NEXT: s_endpgm
;
@@ -590,11 +590,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX8GISEL-NEXT: .LBB4_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -628,10 +628,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_cfg:
@@ -661,11 +661,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX9GISEL-NEXT: .LBB4_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_cfg:
@@ -698,10 +698,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_cfg:
@@ -731,11 +731,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_cfg:
@@ -768,10 +768,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_cfg:
@@ -801,11 +801,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_cfg:
@@ -839,10 +839,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -875,11 +875,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -915,10 +915,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -951,10 +951,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
index bfdb2da6dc6a4..bfae6f0f26023 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll
@@ -30,12 +30,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
;
; GFX8GISEL-LABEL: uniform_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -84,12 +84,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-LABEL: uniform_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
; GFX1164DAGISEL-NEXT: s_clause 0x1
-; GFX1164DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s2
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -97,12 +97,12 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-LABEL: uniform_value:
; GFX1164GISEL: ; %bb.0: ; %entry
; GFX1164GISEL-NEXT: s_clause 0x1
-; GFX1164GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -110,11 +110,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-LABEL: uniform_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
; GFX1132DAGISEL-NEXT: s_clause 0x1
-; GFX1132DAGISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -122,11 +122,11 @@ define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-LABEL: uniform_value:
; GFX1132GISEL: ; %bb.0: ; %entry
; GFX1132GISEL-NEXT: s_clause 0x1
-; GFX1132GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -139,98 +139,98 @@ entry:
define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: const_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: const_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, 0x7b
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: const_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: const_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX10DAGISEL-LABEL: const_value:
; GFX10DAGISEL: ; %bb.0: ; %entry
-; GFX10DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX10DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX10DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10DAGISEL-NEXT: s_endpgm
;
; GFX10GISEL-LABEL: const_value:
; GFX10GISEL: ; %bb.0: ; %entry
-; GFX10GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX10GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: const_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: const_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: const_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: const_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -281,241 +281,241 @@ entry:
define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) {
; GFX8DAGISEL-LABEL: divergent_value:
; GFX8DAGISEL: ; %bb.0: ; %entry
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX8DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8DAGISEL-NEXT: ; %bb.2:
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX8DAGISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8DAGISEL-NEXT: s_endpgm
;
; GFX8GISEL-LABEL: divergent_value:
; GFX8GISEL: ; %bb.0: ; %entry
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX8GISEL-NEXT: s_mov_b32 s4, -1
; GFX8GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX8GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX8GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX8GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX8GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX8GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX8GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX8GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX8GISEL-NEXT: ; %bb.2:
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s4
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
; GFX9DAGISEL-LABEL: divergent_value:
; GFX9DAGISEL: ; %bb.0: ; %entry
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX9DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9DAGISEL-NEXT: ; %bb.2:
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_value:
; GFX9GISEL: ; %bb.0: ; %entry
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX9GISEL-NEXT: s_mov_b32 s4, -1
; GFX9GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX9GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX9GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX9GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX9GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX9GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX9GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX9GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX9GISEL-NEXT: ; %bb.2:
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_value:
; GFX1064DAGISEL: ; %bb.0: ; %entry
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1064DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064DAGISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064DAGISEL-NEXT: ; %bb.2:
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_value:
; GFX1064GISEL: ; %bb.0: ; %entry
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1064GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1064GISEL-NEXT: s_mov_b32 s4, -1
; GFX1064GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[2:3]
+; GFX1064GISEL-NEXT: s_ff1_i32_b64 s5, s[0:1]
; GFX1064GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1064GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1064GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1064GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1064GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1064GISEL-NEXT: ; %bb.2:
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_value:
; GFX1032DAGISEL: ; %bb.0: ; %entry
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1032DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032DAGISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032DAGISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032DAGISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1032DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032DAGISEL-NEXT: ; %bb.2:
-; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_value:
; GFX1032GISEL: ; %bb.0: ; %entry
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX1032GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1032GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1032GISEL-NEXT: s_mov_b32 s0, -1
; GFX1032GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s3
+; GFX1032GISEL-NEXT: s_ff1_i32_b32 s4, s1
; GFX1032GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1032GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1032GISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1032GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1032GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1032GISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1032GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1032GISEL-NEXT: ; %bb.2:
-; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_value:
; GFX1164DAGISEL: ; %bb.0: ; %entry
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1164DAGISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164DAGISEL-NEXT: s_mov_b32 s4, -1
; GFX1164DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164DAGISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164DAGISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164DAGISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164DAGISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164DAGISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164DAGISEL-NEXT: ; %bb.2:
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
;
; GFX1164GISEL-LABEL: divergent_value:
; GFX1164GISEL: ; %bb.0: ; %entry
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1164GISEL-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_mov_b64 s[0:1], exec
; GFX1164GISEL-NEXT: s_mov_b32 s4, -1
; GFX1164GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[2:3]
+; GFX1164GISEL-NEXT: s_ctz_i32_b64 s5, s[0:1]
; GFX1164GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1164GISEL-NEXT: v_readlane_b32 s6, v0, s5
-; GFX1164GISEL-NEXT: s_bitset0_b64 s[2:3], s5
+; GFX1164GISEL-NEXT: s_bitset0_b64 s[0:1], s5
; GFX1164GISEL-NEXT: s_min_u32 s4, s4, s6
-; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1164GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1164GISEL-NEXT: ; %bb.2:
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
;
; GFX1132DAGISEL-LABEL: divergent_value:
; GFX1132DAGISEL: ; %bb.0: ; %entry
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1132DAGISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132DAGISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132DAGISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-NEXT: s_mov_b32 s0, -1
; GFX1132DAGISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132DAGISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132DAGISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132DAGISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132DAGISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132DAGISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132DAGISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132DAGISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1132DAGISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132DAGISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132DAGISEL-NEXT: ; %bb.2:
-; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
;
; GFX1132GISEL-LABEL: divergent_value:
; GFX1132GISEL: ; %bb.0: ; %entry
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX1132GISEL-NEXT: s_mov_b32 s3, exec_lo
-; GFX1132GISEL-NEXT: s_mov_b32 s2, -1
+; GFX1132GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT: s_mov_b32 s0, -1
; GFX1132GISEL-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1
-; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s3
+; GFX1132GISEL-NEXT: s_ctz_i32_b32 s4, s1
; GFX1132GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1132GISEL-NEXT: v_readlane_b32 s5, v0, s4
-; GFX1132GISEL-NEXT: s_bitset0_b32 s3, s4
-; GFX1132GISEL-NEXT: s_min_u32 s2, s2, s5
-; GFX1132GISEL-NEXT: s_cmp_lg_u32 s3, 0
+; GFX1132GISEL-NEXT: s_bitset0_b32 s1, s4
+; GFX1132GISEL-NEXT: s_min_u32 s0, s0, s5
+; GFX1132GISEL-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB3_1
; GFX1132GISEL-NEXT: ; %bb.2:
-; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
+; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
@@ -557,10 +557,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX8DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX8DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s1
-; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT: v_mov_b32_e32 v3, s3
; GFX8DAGISEL-NEXT: flat_store_dword v[2:3], v1
; GFX8DAGISEL-NEXT: s_endpgm
;
@@ -591,11 +591,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX8GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX8GISEL-NEXT: .LBB4_5: ; %endif
; GFX8GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8GISEL-NEXT: v_mov_b32_e32 v2, s6
; GFX8GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX8GISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX8GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8GISEL-NEXT: s_endpgm
;
@@ -629,10 +629,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX9DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX9DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9DAGISEL-NEXT: s_endpgm
;
; GFX9GISEL-LABEL: divergent_cfg:
@@ -662,11 +662,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX9GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX9GISEL-NEXT: .LBB4_5: ; %endif
; GFX9GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX9GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX9GISEL-NEXT: s_endpgm
;
; GFX1064DAGISEL-LABEL: divergent_cfg:
@@ -699,10 +699,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1064DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1064DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064DAGISEL-NEXT: s_endpgm
;
; GFX1064GISEL-LABEL: divergent_cfg:
@@ -732,11 +732,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1064GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1064GISEL-NEXT: .LBB4_5: ; %endif
; GFX1064GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1064GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1064GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1064GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX1064GISEL-NEXT: s_endpgm
;
; GFX1032DAGISEL-LABEL: divergent_cfg:
@@ -769,10 +769,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1032DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1032DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032DAGISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032DAGISEL-NEXT: s_endpgm
;
; GFX1032GISEL-LABEL: divergent_cfg:
@@ -802,11 +802,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1032GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1032GISEL-NEXT: .LBB4_5: ; %endif
; GFX1032GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1032GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX1032GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX1032GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1032GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX1032GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX1032GISEL-NEXT: s_endpgm
;
; GFX1164DAGISEL-LABEL: divergent_cfg:
@@ -840,10 +840,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s6
; GFX1164DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1164DAGISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1164DAGISEL-NEXT: s_nop 0
; GFX1164DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164DAGISEL-NEXT: s_endpgm
@@ -876,11 +876,11 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1164GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1164GISEL-NEXT: .LBB4_5: ; %endif
; GFX1164GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1164GISEL-NEXT: v_mov_b32_e32 v0, s6
; GFX1164GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX1164GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1164GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1164GISEL-NEXT: s_nop 0
; GFX1164GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164GISEL-NEXT: s_endpgm
@@ -916,10 +916,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s3
; GFX1132DAGISEL-NEXT: .LBB4_6: ; %endif
; GFX1132DAGISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
-; GFX1132DAGISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132DAGISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX1132DAGISEL-NEXT: s_nop 0
; GFX1132DAGISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132DAGISEL-NEXT: s_endpgm
@@ -952,10 +952,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
; GFX1132GISEL-NEXT: s_cbranch_scc1 .LBB4_4
; GFX1132GISEL-NEXT: .LBB4_5: ; %endif
; GFX1132GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
-; GFX1132GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132GISEL-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX1132GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
; GFX1132GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1132GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX1132GISEL-NEXT: s_nop 0
; GFX1132GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
index 3eb226193051d..e0340767ac651 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll
@@ -5,34 +5,34 @@
define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -51,34 +51,34 @@ entry:
define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 1
; GCN-NEXT: s_barrier_wait 1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 1
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -97,34 +97,34 @@ entry:
define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_signal:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal 0
; GCN-NEXT: s_barrier_wait 0
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_signal:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal 0
; GLOBAL-ISEL-NEXT: s_barrier_wait 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -143,7 +143,7 @@ entry:
define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_var:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
@@ -151,29 +151,29 @@ define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v1, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v1, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal m0
; GCN-NEXT: s_barrier_wait 1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal m0
; GLOBAL-ISEL-NEXT: s_barrier_wait 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -222,43 +222,43 @@ define void @test2_s_barrier_signal_var(i32 %arg) {
define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst -1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -278,43 +278,43 @@ entry:
define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -334,43 +334,43 @@ entry:
define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_signal_isfirst:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst 1
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -390,45 +390,45 @@ entry:
define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_signal_isfirst_var:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_mov_b32 m0, 1
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal_isfirst m0
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -732,29 +732,29 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) {
define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join -1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -772,29 +772,29 @@ entry:
define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join 1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -812,29 +812,29 @@ entry:
define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_barrier_join:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_barrier_join 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_barrier_join:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_barrier_join 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -924,41 +924,41 @@ define void @test5_s_barrier_join_m0(i32 %arg) {
define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_barrier_leave:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GCN-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_barrier_leave
-; GCN-NEXT: s_cselect_b32 s3, s3, s5
-; GCN-NEXT: s_cselect_b32 s2, s2, s4
+; GCN-NEXT: s_cselect_b32 s1, s7, s9
+; GCN-NEXT: s_cselect_b32 s0, s6, s8
; GCN-NEXT: s_clause 0x1
-; GCN-NEXT: global_load_b32 v2, v1, s[0:1]
-; GCN-NEXT: global_load_b32 v1, v1, s[2:3]
+; GCN-NEXT: global_load_b32 v2, v1, s[4:5]
+; GCN-NEXT: global_load_b32 v1, v1, s[0:1]
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: v_mul_lo_u32 v1, v1, v2
-; GCN-NEXT: global_store_b32 v0, v1, s[6:7]
+; GCN-NEXT: global_store_b32 v0, v1, s[10:11]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_barrier_leave:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_barrier_leave
-; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1
-; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0
-; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
+; GLOBAL-ISEL-NEXT: s_and_b32 s0, s0, 1
+; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GLOBAL-ISEL-NEXT: s_cselect_b64 s[0:1], s[6:7], s[8:9]
; GLOBAL-ISEL-NEXT: s_clause 0x1
-; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[4:5]
+; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[0:1]
; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7]
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[10:11]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -978,29 +978,29 @@ entry:
define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier -1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1018,29 +1018,29 @@ entry:
define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier 1
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1058,29 +1058,29 @@ entry:
define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_wakeup_barrier:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GCN-NEXT: s_wakeup_barrier 0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v2, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v2, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1170,27 +1170,27 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) {
define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test1_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, -1
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, -1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, -1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1206,27 +1206,27 @@ entry:
define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test2_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, 1
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, 1
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, 1
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1242,27 +1242,27 @@ entry:
define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test3_s_get_barrier_state:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_get_barrier_state s2, 0
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_get_barrier_state s4, 0
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_lshlrev_b32 v0, 2, v0
-; GCN-NEXT: global_store_b32 v0, v1, s[0:1]
+; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
-; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
+; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, 0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
-; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2
-; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s0
+; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
@@ -1352,34 +1352,34 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) {
define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 {
; GCN-LABEL: test_barrier_convert:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0
; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_store_b32 v3, v2, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v2, s[2:3]
; GCN-NEXT: s_wait_storecnt 0x0
; GCN-NEXT: s_barrier_signal -1
; GCN-NEXT: s_barrier_wait -1
-; GCN-NEXT: global_store_b32 v3, v0, s[0:1]
+; GCN-NEXT: global_store_b32 v3, v0, s[2:3]
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GCN-NEXT: s_endpgm
;
; GLOBAL-ISEL-LABEL: test_barrier_convert:
; GLOBAL-ISEL: ; %bb.0: ; %entry
-; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GLOBAL-ISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0
; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0
; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[2:3]
; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0
; GLOBAL-ISEL-NEXT: s_barrier_signal -1
; GLOBAL-ISEL-NEXT: s_barrier_wait -1
-; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1]
+; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[2:3]
; GLOBAL-ISEL-NEXT: s_nop 0
; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GLOBAL-ISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index eb30484ea7f19..3883b3acb211d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -5,22 +5,22 @@
define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_doorbell:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_doorbell:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DOORBELL)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -32,22 +32,22 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_ddid:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DDID)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_ddid:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_GET_DDID)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -59,12 +59,12 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tma:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_TMA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -76,12 +76,12 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_realtime:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_REALTIME)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -93,22 +93,22 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_savewave:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_savewave:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(MSG_RTN_SAVE_WAVE)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -120,12 +120,12 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_tba:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], sendmsg(MSG_RTN_GET_TBA)
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -137,22 +137,22 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
; GFX11-SDAG-LABEL: test_get_0_i32:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(0, 0, 0)
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: test_get_0_i32:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s0, sendmsg(0, 0, 0)
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -164,12 +164,12 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
; GFX11-LABEL: test_get_99999_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_sendmsg_rtn_b64 s[0:1], 99999
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
index 8f8994e78fd06..2c5efd3e2a54d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
@@ -5,16 +5,16 @@
define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
; GCN-LABEL: set_inactive:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -24,12 +24,12 @@ define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) {
define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, 1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0
store i32 %tmp, ptr addrspace(1) %out
@@ -39,19 +39,19 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) {
define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
; GCN-LABEL: set_inactive_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -61,13 +61,13 @@ define amdgpu_kernel void @set_inactive_64(ptr addrspace(1) %out, i64 %in) {
define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) {
; GCN-LABEL: set_inactive_imm_poison_64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 1
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0
store i64 %tmp, ptr addrspace(1) %out
@@ -81,30 +81,30 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_buffer_load_dword s3, s[4:7], 0x0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 42
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lg_u32 s3, 56
-; GCN-NEXT: s_mov_b64 s[2:3], -1
+; GCN-NEXT: s_mov_b64 s[0:1], -1
; GCN-NEXT: s_cbranch_scc1 .LBB4_3
; GCN-NEXT: ; %bb.1: ; %Flow
-; GCN-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GCN-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GCN-NEXT: s_cbranch_vccz .LBB4_4
; GCN-NEXT: .LBB4_2: ; %.exit
; GCN-NEXT: s_endpgm
; GCN-NEXT: .LBB4_3: ; %.one
; GCN-NEXT: v_add_u32_e32 v1, vcc, 1, v0
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GCN-NEXT: s_cbranch_execnz .LBB4_2
; GCN-NEXT: .LBB4_4: ; %.zero
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%val = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %desc, i32 0, i32 0)
%cmp = icmp eq i32 %val, 56
@@ -127,17 +127,17 @@ define amdgpu_kernel void @set_inactive_scc(ptr addrspace(1) %out, i32 %in, <4 x
define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
; GCN-LABEL: set_inactive_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x40400000
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x40400000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call float @llvm.amdgcn.set.inactive.f32(float %in, float 3.0) #0
store float %tmp, ptr addrspace(1) %out
@@ -147,21 +147,21 @@ define amdgpu_kernel void @set_inactive_f32(ptr addrspace(1) %out, float %in) {
define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
; GCN-LABEL: set_inactive_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mov_b32 s0, 0xcccccccd
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s1, 0x4010cccc
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: s_mov_b32 s4, 0xcccccccd
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s5, 0x4010cccc
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call double @llvm.amdgcn.set.inactive.f64(double %in, double 4.2) #0
store double %tmp, ptr addrspace(1) %out
@@ -171,17 +171,17 @@ define amdgpu_kernel void @set_inactive_f64(ptr addrspace(1) %out, double %in) {
define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %in) {
; GCN-LABEL: set_inactive_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x10001
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x10001
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x i16> @llvm.amdgcn.set.inactive.v2i16(<2 x i16> %in, <2 x i16> <i16 1, i16 1>) #0
store <2 x i16> %tmp, ptr addrspace(1) %out
@@ -191,17 +191,17 @@ define amdgpu_kernel void @set_inactive_v2i16(ptr addrspace(1) %out, <2 x i16> %
define amdgpu_kernel void @set_inactive_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; GCN-LABEL: set_inactive_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3c003c00
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x3c003c00
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x half> @llvm.amdgcn.set.inactive.v2f16(<2 x half> %in, <2 x half> <half 1.0, half 1.0>) #0
store <2 x half> %tmp, ptr addrspace(1) %out
@@ -259,17 +259,17 @@ define amdgpu_kernel void @set_inactive_v2f32(ptr addrspace(1) %out, <2 x float>
define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in) {
; GCN-LABEL: set_inactive_v2bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
-; GCN-NEXT: s_mov_b32 s5, 0x3f803f80
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_mov_b32 s0, 0x3f803f80
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) #0
store <2 x bfloat> %tmp, ptr addrspace(1) %out
@@ -351,19 +351,19 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa
define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
; GCN-LABEL: set_inactive_p0:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: s_mov_b32 s4, s0
-; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NEXT: s_mov_b32 s0, s4
+; GCN-NEXT: s_mov_b32 s1, s5
+; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr @llvm.amdgcn.set.inactive.p0(ptr %in, ptr null) #0
store ptr %tmp, ptr addrspace(1) %out
@@ -373,16 +373,16 @@ define amdgpu_kernel void @set_inactive_p0(ptr addrspace(1) %out, ptr %in) {
define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(2) %in) {
; GCN-LABEL: set_inactive_p2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(2) @llvm.amdgcn.set.inactive.p2(ptr addrspace(2) %in, ptr addrspace(2) null) #0
store ptr addrspace(2) %tmp, ptr addrspace(1) %out
@@ -392,16 +392,16 @@ define amdgpu_kernel void @set_inactive_p2(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GCN-LABEL: set_inactive_p3:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(3) @llvm.amdgcn.set.inactive.p3(ptr addrspace(3) %in, ptr addrspace(3) null) #0
store ptr addrspace(3) %tmp, ptr addrspace(1) %out
@@ -411,16 +411,16 @@ define amdgpu_kernel void @set_inactive_p3(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(5) %in) {
; GCN-LABEL: set_inactive_p5:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(5) @llvm.amdgcn.set.inactive.p5(ptr addrspace(5) %in, ptr addrspace(5) null) #0
store ptr addrspace(5) %tmp, ptr addrspace(1) %out
@@ -430,16 +430,16 @@ define amdgpu_kernel void @set_inactive_p5(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @set_inactive_p6(ptr addrspace(1) %out, ptr addrspace(6) %in) {
; GCN-LABEL: set_inactive_p6:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: s_not_b64 exec, exec
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_not_b64 exec, exec
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
%tmp = call ptr addrspace(6) @llvm.amdgcn.set.inactive.p6(ptr addrspace(6) %in, ptr addrspace(6) null) #0
store ptr addrspace(6) %tmp, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
index 87c5f5bd7b784..7bcafeae82832 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -147,12 +147,12 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out,
;
; VI-LABEL: bfe_u32_arg_0_width_reg_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -172,12 +172,12 @@ define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out,
;
; VI-LABEL: bfe_u32_arg_0_width_imm_offset:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -204,19 +204,19 @@ define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrsp
;
; VI-LABEL: bfe_u32_zextload_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i8, ptr addrspace(1) %in
%ext = zext i8 %load to i32
@@ -248,21 +248,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: bfe_u32_zext_in_reg_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -294,21 +294,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: bfe_u32_zext_in_reg_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -341,22 +341,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xfe, v0
; VI-NEXT: v_bfe_u32 v0, v0, 1, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -389,22 +389,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0xf8, v0
; VI-NEXT: v_bfe_u32 v0, v0, 3, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -437,22 +437,22 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_and_b32_e32 v0, 0x80, v0
; VI-NEXT: v_bfe_u32 v0, v0, 7, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -484,21 +484,21 @@ define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %ou
;
; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_bfe_u32 v0, v0, 8, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%load = load i32, ptr addrspace(1) %in, align 4
%add = add i32 %load, 1
@@ -529,20 +529,20 @@ define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
@@ -563,12 +563,12 @@ define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -590,12 +590,12 @@ define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -617,12 +617,12 @@ define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -653,20 +653,20 @@ define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_5:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfe_i32 v0, v0, 0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -698,21 +698,21 @@ define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
; VI-NEXT: v_and_b32_e32 v0, 2.0, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -742,20 +742,20 @@ define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -785,20 +785,20 @@ define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = shl i32 %x, 31
@@ -828,20 +828,20 @@ define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: bfe_u32_test_9:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
@@ -870,20 +870,20 @@ define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
@@ -912,20 +912,20 @@ define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_11:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
@@ -954,20 +954,20 @@ define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_12:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
@@ -997,20 +997,20 @@ define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_13:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 31, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = ashr i32 %x, 31
@@ -1031,12 +1031,12 @@ define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: bfe_u32_test_14:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, ptr addrspace(1) %in, align 4
%shl = lshr i32 %x, 31
@@ -1057,12 +1057,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_0:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1082,12 +1082,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1107,12 +1107,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1132,12 +1132,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_3:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1157,12 +1157,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_4:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1182,12 +1182,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_5:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1207,12 +1207,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x80
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1232,12 +1232,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_7:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1257,12 +1257,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1282,12 +1282,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #
;
; VI-LABEL: bfe_u32_constant_fold_test_9:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1307,12 +1307,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_10:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1332,12 +1332,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_11:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1357,12 +1357,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_12:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1382,12 +1382,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_13:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1407,12 +1407,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_14:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 40
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1432,12 +1432,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_15:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 10
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1457,12 +1457,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1482,12 +1482,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_17:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0x7f
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1507,12 +1507,12 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out)
;
; VI-LABEL: bfe_u32_constant_fold_test_18:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
store i32 %bfe_u32, ptr addrspace(1) %out, align 4
@@ -1593,14 +1593,14 @@ define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: lshr_and:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = lshr i32 %a, 6
%c = and i32 %b, 7
@@ -1657,14 +1657,14 @@ define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: and_lshr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = and i32 %a, 448
%c = lshr i32 %b, 6
@@ -1687,14 +1687,14 @@ define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: and_lshr2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x30006
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x30006
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = and i32 %a, 511
%c = lshr i32 %b, 6
@@ -1717,14 +1717,14 @@ define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 {
;
; VI-LABEL: shl_lshr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_u32 s4, s4, 0x150002
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_bfe_u32 s0, s2, 0x150002
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%b = shl i32 %a, 9
%c = lshr i32 %b, 11
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index eeddb3d5b8192..7edac873e6437 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -30,58 +30,58 @@ define amdgpu_kernel void @ceil_f16(
;
; VI-LABEL: ceil_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ceil_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ceil_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: ceil_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
@@ -130,37 +130,37 @@ define amdgpu_kernel void @ceil_v2f16(
;
; VI-LABEL: ceil_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ceil_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_ceil_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ceil_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_ceil_f16_e32 v0.l, v0.l
@@ -172,31 +172,31 @@ define amdgpu_kernel void @ceil_v2f16(
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: ceil_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_ceil_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index fcc4cb3436fd7..28d3e8f7c37c9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -30,55 +30,55 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: cos_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_cos_f16_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: cos_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT: v_cos_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cos_f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT: v_cos_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cos_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cos_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -121,10 +121,10 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: cos_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -134,50 +134,50 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_cos_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cos_f16_e32 v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: cos_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cos_f16_e32 v2, v3
; GFX9-NEXT: v_cos_f16_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: cos_v2f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_cos_f16_e32 v2, v3
; GFX10-NEXT: v_cos_f16_e32 v1, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: cos_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
@@ -188,7 +188,7 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_cos_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 4f65acda2a210..d60e07dfe554a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -12,84 +12,86 @@
define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-LABEL: s_exp_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x39a3b295
; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x39a3b295
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x39a3b295, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
@@ -97,39 +99,39 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc2ce8ed0
-; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x42b17218
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_f32:
; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc2ce8ed0
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42b17218
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_f32:
@@ -853,7 +855,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -870,16 +871,17 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
@@ -891,17 +893,17 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v0
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v6
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -915,9 +917,10 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -926,7 +929,6 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x39a3b295
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -936,14 +938,15 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x3fb8a000, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
@@ -952,19 +955,19 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0
; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
@@ -987,19 +990,20 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6
@@ -1043,15 +1047,15 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_v3f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x32a5705f
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5
@@ -1096,7 +1100,7 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_v3f32:
@@ -1593,7 +1597,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8a000
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -1610,37 +1613,38 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x39a3b295, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x3fb8a000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v10, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9
@@ -1649,17 +1653,17 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x39a3b295, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3fb8a000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v0
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v9
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -1673,9 +1677,10 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1685,7 +1690,6 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8a000
; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x39a3b295
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -1701,49 +1705,50 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v2
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x39a3b295, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc2ce8ed0
; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x3fb8a000, v6
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8
; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9
; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6
-; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3
+; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3
; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8
-; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x39a3b295, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3fb8a000, v8
; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8
; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000
@@ -1764,19 +1769,20 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x3fb8aa3b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x32a5705f
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc2ce8ed0
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
@@ -1787,8 +1793,8 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x42b17218
; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3
@@ -1833,17 +1839,16 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp_v4f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x3fb8aa3b
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x32a5705f
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x42b17218
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2
; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0
@@ -1900,7 +1905,7 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index ff20f90f05ca0..bd167dda017bd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -14,84 +14,86 @@
define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; VI-SDAG-LABEL: s_exp10_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_sub_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; VI-SDAG-NEXT: v_sub_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x3a2784bc
; VI-SDAG-NEXT: v_rndne_f32_e32 v2, v0
-; VI-SDAG-NEXT: v_mul_f32_e32 v3, s3, v3
+; VI-SDAG-NEXT: v_mul_f32_e32 v3, s2, v3
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-SDAG-NEXT: v_add_f32_e32 v1, v3, v1
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp10_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x3a2784bc
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: s_and_b32 s3, s2, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
-; VI-GISEL-NEXT: v_sub_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; VI-GISEL-NEXT: v_sub_f32_e32 v2, s4, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v3, 0x3a2784bc, v2
; VI-GISEL-NEXT: v_mul_f32_e32 v2, 0x40549000, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s3, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s3, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
; VI-GISEL-NEXT: v_rndne_f32_e32 v2, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v2
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_f32:
; GFX900-SDAG: ; %bb.0:
-; GFX900-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s4, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
-; GFX900-SDAG-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-SDAG-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-SDAG-NEXT: v_add_f32_e32 v0, v2, v0
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v0, v0
@@ -99,39 +101,39 @@ define amdgpu_kernel void @s_exp10_f32(ptr addrspace(1) %out, float %in) {
; GFX900-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX900-SDAG-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0xc23369f4
-; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x421a209b
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v1
+; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v1
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX900-SDAG-NEXT: global_store_dword v2, v0, s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_f32:
; GFX900-GISEL: ; %bb.0:
-; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x33979a37
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s2, v0
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v0, -v2
+; GFX900-GISEL-NEXT: v_mul_f32_e32 v2, s4, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v0, -v2
; GFX900-GISEL-NEXT: v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT: v_fma_f32 v0, s2, v1, v0
+; GFX900-GISEL-NEXT: v_fma_f32 v0, s4, v1, v0
; GFX900-GISEL-NEXT: v_sub_f32_e32 v1, v2, v3
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; GFX900-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v3
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0xc23369f4
-; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v2
+; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v2
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX900-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x421a209b
; GFX900-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s2, v1
+; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_f32:
@@ -855,7 +857,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -872,16 +873,17 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s5, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v6
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
@@ -893,17 +895,17 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v5
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v7, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s4, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v6, v0
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v6
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -917,9 +919,10 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -928,7 +931,6 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3a2784bc
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -938,14 +940,15 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v1
; VI-GISEL-NEXT: v_add_f32_e32 v0, v0, v4
; VI-GISEL-NEXT: v_mul_f32_e32 v4, s2, v2
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s2
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v5, s5, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v5
; VI-GISEL-NEXT: v_mul_f32_e32 v5, 0x40549000, v5
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v5, v5, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v5, v7, v5
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
@@ -954,19 +957,19 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_exp_f32_e32 v5, v5
; VI-GISEL-NEXT: v_add_f32_e32 v0, v4, v0
; VI-GISEL-NEXT: v_rndne_f32_e32 v4, v3
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
; VI-GISEL-NEXT: v_sub_f32_e32 v3, v3, v4
; VI-GISEL-NEXT: v_ldexp_f32 v5, v5, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_add_f32_e32 v0, v3, v0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v3, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
-; VI-GISEL-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-GISEL-NEXT: v_mul_f32_e32 v1, s0, v1
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v8
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
; VI-GISEL-NEXT: v_add_f32_e32 v2, v2, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v6, v1
; VI-GISEL-NEXT: v_sub_f32_e32 v1, v1, v6
@@ -989,19 +992,20 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s6, v3
; VI-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v4
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v3, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v3, s2
; VI-GISEL-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_v3f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v6, s5, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v6
@@ -1045,15 +1049,15 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v5
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx3 v4, v[0:2], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_v3f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x33979a37
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v5, s5, v1
; GFX900-GISEL-NEXT: v_fma_f32 v6, s5, v1, -v5
@@ -1098,7 +1102,7 @@ define amdgpu_kernel void @s_exp10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s6, v3
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx3 v3, v[0:2], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_v3f32:
@@ -1595,7 +1599,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549000
; VI-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; VI-SDAG-NEXT: s_and_b32 s2, s7, 0xfffff000
; VI-SDAG-NEXT: v_mov_b32_e32 v2, s2
@@ -1612,37 +1615,38 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_add_f32_e32 v1, v1, v2
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v2, v3
-; VI-SDAG-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-SDAG-NEXT: v_mov_b32_e32 v7, s2
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-SDAG-NEXT: v_mov_b32_e32 v7, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v7, s6, v7
; VI-SDAG-NEXT: v_ldexp_f32 v1, v1, v2
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v8, 0x3a2784bc, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v7, 0x40549000, v7
; VI-SDAG-NEXT: v_rndne_f32_e32 v3, v2
; VI-SDAG-NEXT: v_add_f32_e32 v7, v7, v8
-; VI-SDAG-NEXT: v_mul_f32_e32 v8, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v8, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-SDAG-NEXT: v_add_f32_e32 v7, v8, v7
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_exp_f32_e32 v2, v2
; VI-SDAG-NEXT: v_cvt_i32_f32_e32 v7, v3
; VI-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
-; VI-SDAG-NEXT: s_and_b32 s2, s5, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s5, 0xfffff000
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v8, 0x7f800000
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s7, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s5, v9
; VI-SDAG-NEXT: v_cndmask_b32_e32 v3, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v2, v7
-; VI-SDAG-NEXT: v_mul_f32_e32 v2, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v2, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v2
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v10, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v10, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v2, v7
; VI-SDAG-NEXT: v_add_f32_e32 v9, v10, v9
; VI-SDAG-NEXT: v_add_f32_e32 v2, v2, v9
@@ -1651,17 +1655,17 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s6, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s6, v6
-; VI-SDAG-NEXT: s_and_b32 s2, s4, 0xfffff000
+; VI-SDAG-NEXT: s_and_b32 s0, s4, 0xfffff000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc
; VI-SDAG-NEXT: v_ldexp_f32 v1, v9, v7
-; VI-SDAG-NEXT: v_mov_b32_e32 v9, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v9, s0
; VI-SDAG-NEXT: v_sub_f32_e32 v9, s4, v9
-; VI-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0
; VI-SDAG-NEXT: v_mul_f32_e32 v10, 0x3a2784bc, v9
; VI-SDAG-NEXT: v_mul_f32_e32 v9, 0x40549000, v9
; VI-SDAG-NEXT: v_rndne_f32_e32 v7, v0
; VI-SDAG-NEXT: v_add_f32_e32 v9, v9, v10
-; VI-SDAG-NEXT: v_mul_f32_e32 v4, s2, v4
+; VI-SDAG-NEXT: v_mul_f32_e32 v4, s0, v4
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v0, v7
; VI-SDAG-NEXT: v_add_f32_e32 v4, v4, v9
; VI-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
@@ -1675,9 +1679,10 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v5
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; VI-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1687,7 +1692,6 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549000
; VI-GISEL-NEXT: v_mov_b32_e32 v3, 0x3a2784bc
; VI-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_and_b32 s2, s4, 0xfffff000
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -1703,49 +1707,50 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_add_f32_e32 v0, v1, v0
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v1, v4
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; VI-GISEL-NEXT: s_and_b32 s2, s5, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v6, s2, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_and_b32 s0, s5, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v6, s0, v2
; VI-GISEL-NEXT: v_ldexp_f32 v0, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v1, s5, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v7, 0x3a2784bc, v1
; VI-GISEL-NEXT: v_mul_f32_e32 v1, 0x40549000, v1
; VI-GISEL-NEXT: v_add_f32_e32 v1, v1, v7
-; VI-GISEL-NEXT: v_mul_f32_e32 v7, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v7, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v1, v7, v1
; VI-GISEL-NEXT: v_rndne_f32_e32 v7, v6
; VI-GISEL-NEXT: v_sub_f32_e32 v6, v6, v7
; VI-GISEL-NEXT: v_add_f32_e32 v1, v6, v1
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v6, v7
; VI-GISEL-NEXT: v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT: s_and_b32 s2, s6, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v8, s2, v2
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
+; VI-GISEL-NEXT: s_and_b32 s0, s6, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v8, s0, v2
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xc23369f4
; VI-GISEL-NEXT: v_ldexp_f32 v1, v1, v6
-; VI-GISEL-NEXT: v_mov_b32_e32 v6, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v6, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v6, s6, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v6
; VI-GISEL-NEXT: v_mul_f32_e32 v6, 0x40549000, v6
; VI-GISEL-NEXT: v_add_f32_e32 v6, v6, v9
-; VI-GISEL-NEXT: v_mul_f32_e32 v9, s2, v3
+; VI-GISEL-NEXT: v_mul_f32_e32 v9, s0, v3
; VI-GISEL-NEXT: v_add_f32_e32 v6, v9, v6
; VI-GISEL-NEXT: v_rndne_f32_e32 v9, v8
; VI-GISEL-NEXT: v_sub_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v6, v8, v6
; VI-GISEL-NEXT: v_cvt_i32_f32_e32 v8, v9
; VI-GISEL-NEXT: v_exp_f32_e32 v6, v6
-; VI-GISEL-NEXT: s_and_b32 s2, s7, 0xfffff000
-; VI-GISEL-NEXT: v_mul_f32_e32 v2, s2, v2
-; VI-GISEL-NEXT: v_mul_f32_e32 v3, s2, v3
+; VI-GISEL-NEXT: s_and_b32 s0, s7, 0xfffff000
+; VI-GISEL-NEXT: v_mul_f32_e32 v2, s0, v2
+; VI-GISEL-NEXT: v_mul_f32_e32 v3, s0, v3
; VI-GISEL-NEXT: v_ldexp_f32 v6, v6, v8
-; VI-GISEL-NEXT: v_mov_b32_e32 v8, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v8, s0
; VI-GISEL-NEXT: v_sub_f32_e32 v8, s7, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v9, 0x3a2784bc, v8
; VI-GISEL-NEXT: v_mul_f32_e32 v8, 0x40549000, v8
; VI-GISEL-NEXT: v_add_f32_e32 v8, v8, v9
; VI-GISEL-NEXT: v_add_f32_e32 v3, v3, v8
; VI-GISEL-NEXT: v_rndne_f32_e32 v8, v2
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v4
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v2, v8
; VI-GISEL-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; VI-GISEL-NEXT: v_mov_b32_e32 v7, 0x7f800000
@@ -1766,19 +1771,20 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s7, v4
; VI-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
; VI-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
-; VI-GISEL-NEXT: v_mov_b32_e32 v5, s1
+; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VI-GISEL-NEXT: v_mov_b32_e32 v5, s3
; VI-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
-; VI-GISEL-NEXT: v_mov_b32_e32 v4, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v4, s2
; VI-GISEL-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-GISEL-NEXT: s_endpgm
;
; GFX900-SDAG-LABEL: s_exp10_v4f32:
; GFX900-SDAG: ; %bb.0:
; GFX900-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_mov_b32_e32 v0, 0x40549a78
; GFX900-SDAG-NEXT: v_mov_b32_e32 v1, 0x33979a37
; GFX900-SDAG-NEXT: v_mov_b32_e32 v5, 0xc23369f4
-; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-SDAG-NEXT: v_mul_f32_e32 v2, s7, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v3, v2
@@ -1789,8 +1795,8 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-NEXT: v_cvt_i32_f32_e32 v3, v3
; GFX900-SDAG-NEXT: v_exp_f32_e32 v2, v2
; GFX900-SDAG-NEXT: v_cmp_nlt_f32_e32 vcc, s7, v5
+; GFX900-SDAG-NEXT: v_mov_b32_e32 v6, 0x421a209b
; GFX900-SDAG-NEXT: v_mov_b32_e32 v9, 0x7f800000
-; GFX900-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-SDAG-NEXT: v_ldexp_f32 v2, v2, v3
; GFX900-SDAG-NEXT: v_mul_f32_e32 v3, s6, v0
; GFX900-SDAG-NEXT: v_rndne_f32_e32 v7, v3
@@ -1835,17 +1841,16 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-SDAG-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v6
; GFX900-SDAG-NEXT: v_mov_b32_e32 v4, 0
; GFX900-SDAG-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
-; GFX900-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-SDAG-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-SDAG-NEXT: s_endpgm
;
; GFX900-GISEL-LABEL: s_exp10_v4f32:
; GFX900-GISEL: ; %bb.0:
; GFX900-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v2, 0x40549a78
; GFX900-GISEL-NEXT: v_mov_b32_e32 v3, 0x33979a37
; GFX900-GISEL-NEXT: v_mov_b32_e32 v5, 0x421a209b
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s4, v2
; GFX900-GISEL-NEXT: v_fma_f32 v1, s4, v2, -v0
@@ -1902,7 +1907,7 @@ define amdgpu_kernel void @s_exp10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX900-GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s7, v5
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX900-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX900-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; SI-SDAG-LABEL: s_exp10_v4f32:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 06fa910366584..197aa0735886c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -50,39 +50,39 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_exp2_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; VI-SDAG-NEXT: v_add_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: v_add_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v1, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_exp2_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT: v_add_f32_e32 v0, s4, v0
; VI-GISEL-NEXT: v_exp_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; VI-GISEL-NEXT: v_mul_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -109,17 +109,18 @@ define amdgpu_kernel void @s_exp2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42800000
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; GFX900-GISEL-NEXT: v_add_f32_e32 v0, s2, v0
; GFX900-GISEL-NEXT: v_exp_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; R600-LABEL: s_exp2_f32:
@@ -445,7 +446,7 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_exp2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
@@ -467,9 +468,9 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_exp_f32_e32 v6, v6
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_mul_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_mul_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -730,7 +731,7 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_exp2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0xc2fc0000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x42800000
@@ -757,10 +758,10 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_exp_f32_e32 v9, v1
; VI-SDAG-NEXT: v_mul_f32_e32 v3, v4, v2
; VI-SDAG-NEXT: v_mul_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_mul_f32_e32 v1, v8, v7
; VI-SDAG-NEXT: v_mul_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index e8d037c5ff53e..fca039888593f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -30,58 +30,58 @@ define amdgpu_kernel void @floor_f16(
;
; VI-LABEL: floor_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
-; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
@@ -131,37 +131,37 @@ define amdgpu_kernel void @floor_v2f16(
;
; VI-LABEL: floor_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_floor_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_floor_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: floor_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_floor_f16_e32 v0.l, v0.l
@@ -173,31 +173,31 @@ define amdgpu_kernel void @floor_v2f16(
; GFX11-NEXT: v_mov_b16_e32 v0.l, v0.h
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX11-FAKE16-LABEL: floor_v2f16:
; GFX11-FAKE16: ; %bb.0: ; %entry
-; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1
-; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6
-; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1
+; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FAKE16-NEXT: s_mov_b32 s10, s2
+; GFX11-FAKE16-NEXT: s_mov_b32 s11, s3
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2
-; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3
-; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT: s_mov_b32 s8, s6
+; GFX11-FAKE16-NEXT: s_mov_b32 s9, s7
+; GFX11-FAKE16-NEXT: s_mov_b32 s0, s4
; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, s5
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_floor_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FAKE16-NEXT: s_nop 0
; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FAKE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index a2e30603b6afc..038ad95a3e472 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -48,177 +48,177 @@ define amdgpu_kernel void @fmuladd_f16(
;
; VI-FLUSH-LABEL: fmuladd_f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
-; VI-FLUSH-NEXT: s_mov_b32 s10, -1
-; VI-FLUSH-NEXT: s_mov_b32 s14, s10
-; VI-FLUSH-NEXT: s_mov_b32 s15, s11
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT: s_mov_b32 s2, -1
+; VI-FLUSH-NEXT: s_mov_b32 s14, s2
+; VI-FLUSH-NEXT: s_mov_b32 s15, s3
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: s_mov_b32 s12, s2
-; VI-FLUSH-NEXT: s_mov_b32 s13, s3
-; VI-FLUSH-NEXT: s_mov_b32 s16, s4
-; VI-FLUSH-NEXT: s_mov_b32 s17, s5
-; VI-FLUSH-NEXT: s_mov_b32 s18, s10
-; VI-FLUSH-NEXT: s_mov_b32 s19, s11
-; VI-FLUSH-NEXT: s_mov_b32 s4, s6
-; VI-FLUSH-NEXT: s_mov_b32 s5, s7
-; VI-FLUSH-NEXT: s_mov_b32 s6, s10
-; VI-FLUSH-NEXT: s_mov_b32 s7, s11
+; VI-FLUSH-NEXT: s_mov_b32 s12, s6
+; VI-FLUSH-NEXT: s_mov_b32 s13, s7
+; VI-FLUSH-NEXT: s_mov_b32 s16, s8
+; VI-FLUSH-NEXT: s_mov_b32 s17, s9
+; VI-FLUSH-NEXT: s_mov_b32 s18, s2
+; VI-FLUSH-NEXT: s_mov_b32 s19, s3
+; VI-FLUSH-NEXT: s_mov_b32 s8, s10
+; VI-FLUSH-NEXT: s_mov_b32 s9, s11
+; VI-FLUSH-NEXT: s_mov_b32 s10, s2
+; VI-FLUSH-NEXT: s_mov_b32 s11, s3
; VI-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; VI-FLUSH-NEXT: s_mov_b32 s8, s0
-; VI-FLUSH-NEXT: s_mov_b32 s9, s1
+; VI-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; VI-FLUSH-NEXT: s_mov_b32 s0, s4
+; VI-FLUSH-NEXT: s_mov_b32 s1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
; VI-FLUSH-NEXT: v_mac_f16_e32 v2, v0, v1
-; VI-FLUSH-NEXT: buffer_store_short v2, off, s[8:11], 0
+; VI-FLUSH-NEXT: buffer_store_short v2, off, s[0:3], 0
; VI-FLUSH-NEXT: s_endpgm
;
; VI-DENORM-LABEL: fmuladd_f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
-; VI-DENORM-NEXT: s_mov_b32 s10, -1
-; VI-DENORM-NEXT: s_mov_b32 s14, s10
-; VI-DENORM-NEXT: s_mov_b32 s15, s11
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT: s_mov_b32 s2, -1
+; VI-DENORM-NEXT: s_mov_b32 s14, s2
+; VI-DENORM-NEXT: s_mov_b32 s15, s3
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: s_mov_b32 s12, s2
-; VI-DENORM-NEXT: s_mov_b32 s13, s3
-; VI-DENORM-NEXT: s_mov_b32 s16, s4
-; VI-DENORM-NEXT: s_mov_b32 s17, s5
-; VI-DENORM-NEXT: s_mov_b32 s18, s10
-; VI-DENORM-NEXT: s_mov_b32 s19, s11
-; VI-DENORM-NEXT: s_mov_b32 s4, s6
-; VI-DENORM-NEXT: s_mov_b32 s5, s7
-; VI-DENORM-NEXT: s_mov_b32 s6, s10
-; VI-DENORM-NEXT: s_mov_b32 s7, s11
+; VI-DENORM-NEXT: s_mov_b32 s12, s6
+; VI-DENORM-NEXT: s_mov_b32 s13, s7
+; VI-DENORM-NEXT: s_mov_b32 s16, s8
+; VI-DENORM-NEXT: s_mov_b32 s17, s9
+; VI-DENORM-NEXT: s_mov_b32 s18, s2
+; VI-DENORM-NEXT: s_mov_b32 s19, s3
+; VI-DENORM-NEXT: s_mov_b32 s8, s10
+; VI-DENORM-NEXT: s_mov_b32 s9, s11
+; VI-DENORM-NEXT: s_mov_b32 s10, s2
+; VI-DENORM-NEXT: s_mov_b32 s11, s3
; VI-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; VI-DENORM-NEXT: s_mov_b32 s8, s0
-; VI-DENORM-NEXT: s_mov_b32 s9, s1
+; VI-DENORM-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; VI-DENORM-NEXT: s_mov_b32 s0, s4
+; VI-DENORM-NEXT: s_mov_b32 s1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(0)
; VI-DENORM-NEXT: v_fma_f16 v0, v0, v1, v2
-; VI-DENORM-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-DENORM-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-DENORM-NEXT: s_endpgm
;
; GFX10-FLUSH-LABEL: fmuladd_f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX10-FLUSH-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX10-FLUSH-NEXT: buffer_load_ushort v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT: buffer_load_ushort v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT: s_mov_b32 s21, s11
; GFX10-DENORM-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_ushort v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[8:11], 0
+; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[0:3], 0
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX11-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[16:19], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[4:7], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX11-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s23, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX11-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX11-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX11-DENORM-NEXT: s_mov_b32 s21, s11
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0
; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[16:19], 0
; GFX11-DENORM-NEXT: buffer_load_u16 v2, off, s[20:23], 0
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1
-; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -361,26 +361,26 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; GFX11-FLUSH: ; %bb.0:
; GFX11-FLUSH-NEXT: s_clause 0x1
; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
@@ -389,24 +389,24 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
; GFX11-DENORM: ; %bb.0:
; GFX11-DENORM-NEXT: s_clause 0x1
; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s10, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s11, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -547,26 +547,26 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; GFX11-FLUSH: ; %bb.0:
; GFX11-FLUSH-NEXT: s_clause 0x1
; GFX11-FLUSH-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s2, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s3, s11
+; GFX11-FLUSH-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s4
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
@@ -575,24 +575,24 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
; GFX11-DENORM: ; %bb.0:
; GFX11-DENORM-NEXT: s_clause 0x1
; GFX11-DENORM-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s2, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s3, s11
+; GFX11-DENORM-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s10, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s11, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s5
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0
-; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
@@ -653,27 +653,27 @@ define amdgpu_kernel void @fmuladd_v2f16(
;
; VI-FLUSH-LABEL: fmuladd_v2f16:
; VI-FLUSH: ; %bb.0:
-; VI-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-FLUSH-NEXT: s_mov_b32 s11, 0xf000
-; VI-FLUSH-NEXT: s_mov_b32 s10, -1
-; VI-FLUSH-NEXT: s_mov_b32 s14, s10
-; VI-FLUSH-NEXT: s_mov_b32 s15, s11
+; VI-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-FLUSH-NEXT: s_mov_b32 s3, 0xf000
+; VI-FLUSH-NEXT: s_mov_b32 s2, -1
+; VI-FLUSH-NEXT: s_mov_b32 s14, s2
+; VI-FLUSH-NEXT: s_mov_b32 s15, s3
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; VI-FLUSH-NEXT: s_mov_b32 s12, s2
-; VI-FLUSH-NEXT: s_mov_b32 s13, s3
-; VI-FLUSH-NEXT: s_mov_b32 s16, s4
-; VI-FLUSH-NEXT: s_mov_b32 s17, s5
-; VI-FLUSH-NEXT: s_mov_b32 s18, s10
-; VI-FLUSH-NEXT: s_mov_b32 s19, s11
-; VI-FLUSH-NEXT: s_mov_b32 s4, s6
-; VI-FLUSH-NEXT: s_mov_b32 s5, s7
-; VI-FLUSH-NEXT: s_mov_b32 s6, s10
-; VI-FLUSH-NEXT: s_mov_b32 s7, s11
+; VI-FLUSH-NEXT: s_mov_b32 s12, s6
+; VI-FLUSH-NEXT: s_mov_b32 s13, s7
+; VI-FLUSH-NEXT: s_mov_b32 s16, s8
+; VI-FLUSH-NEXT: s_mov_b32 s17, s9
+; VI-FLUSH-NEXT: s_mov_b32 s18, s2
+; VI-FLUSH-NEXT: s_mov_b32 s19, s3
+; VI-FLUSH-NEXT: s_mov_b32 s8, s10
+; VI-FLUSH-NEXT: s_mov_b32 s9, s11
+; VI-FLUSH-NEXT: s_mov_b32 s10, s2
+; VI-FLUSH-NEXT: s_mov_b32 s11, s3
; VI-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
-; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[4:7], 0
+; VI-FLUSH-NEXT: buffer_load_dword v1, off, s[8:11], 0
; VI-FLUSH-NEXT: buffer_load_dword v2, off, s[16:19], 0
-; VI-FLUSH-NEXT: s_mov_b32 s8, s0
-; VI-FLUSH-NEXT: s_mov_b32 s9, s1
+; VI-FLUSH-NEXT: s_mov_b32 s0, s4
+; VI-FLUSH-NEXT: s_mov_b32 s1, s5
; VI-FLUSH-NEXT: s_waitcnt vmcnt(1)
; VI-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; VI-FLUSH-NEXT: s_waitcnt vmcnt(0)
@@ -681,32 +681,32 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-FLUSH-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-FLUSH-NEXT: v_mac_f16_e32 v1, v0, v2
; VI-FLUSH-NEXT: v_or_b32_e32 v0, v1, v3
-; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-FLUSH-NEXT: s_endpgm
;
; VI-DENORM-LABEL: fmuladd_v2f16:
; VI-DENORM: ; %bb.0:
-; VI-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-DENORM-NEXT: s_mov_b32 s11, 0xf000
-; VI-DENORM-NEXT: s_mov_b32 s10, -1
-; VI-DENORM-NEXT: s_mov_b32 s14, s10
-; VI-DENORM-NEXT: s_mov_b32 s15, s11
+; VI-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-DENORM-NEXT: s_mov_b32 s3, 0xf000
+; VI-DENORM-NEXT: s_mov_b32 s2, -1
+; VI-DENORM-NEXT: s_mov_b32 s14, s2
+; VI-DENORM-NEXT: s_mov_b32 s15, s3
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; VI-DENORM-NEXT: s_mov_b32 s16, s4
-; VI-DENORM-NEXT: s_mov_b32 s17, s5
-; VI-DENORM-NEXT: s_mov_b32 s4, s6
-; VI-DENORM-NEXT: s_mov_b32 s5, s7
-; VI-DENORM-NEXT: s_mov_b32 s6, s10
-; VI-DENORM-NEXT: s_mov_b32 s7, s11
-; VI-DENORM-NEXT: s_mov_b32 s12, s2
-; VI-DENORM-NEXT: s_mov_b32 s13, s3
-; VI-DENORM-NEXT: s_mov_b32 s18, s10
-; VI-DENORM-NEXT: s_mov_b32 s19, s11
-; VI-DENORM-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-DENORM-NEXT: s_mov_b32 s16, s8
+; VI-DENORM-NEXT: s_mov_b32 s17, s9
+; VI-DENORM-NEXT: s_mov_b32 s8, s10
+; VI-DENORM-NEXT: s_mov_b32 s9, s11
+; VI-DENORM-NEXT: s_mov_b32 s10, s2
+; VI-DENORM-NEXT: s_mov_b32 s11, s3
+; VI-DENORM-NEXT: s_mov_b32 s12, s6
+; VI-DENORM-NEXT: s_mov_b32 s13, s7
+; VI-DENORM-NEXT: s_mov_b32 s18, s2
+; VI-DENORM-NEXT: s_mov_b32 s19, s3
+; VI-DENORM-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-DENORM-NEXT: buffer_load_dword v2, off, s[12:15], 0
-; VI-DENORM-NEXT: s_mov_b32 s8, s0
-; VI-DENORM-NEXT: s_mov_b32 s9, s1
+; VI-DENORM-NEXT: s_mov_b32 s0, s4
+; VI-DENORM-NEXT: s_mov_b32 s1, s5
; VI-DENORM-NEXT: s_waitcnt vmcnt(2)
; VI-DENORM-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-DENORM-NEXT: s_waitcnt vmcnt(1)
@@ -717,126 +717,126 @@ define amdgpu_kernel void @fmuladd_v2f16(
; VI-DENORM-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-DENORM-NEXT: v_fma_f16 v0, v2, v1, v0
; VI-DENORM-NEXT: v_or_b32_e32 v0, v0, v3
-; VI-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-DENORM-NEXT: s_endpgm
;
; GFX10-FLUSH-LABEL: fmuladd_v2f16:
; GFX10-FLUSH: ; %bb.0:
-; GFX10-FLUSH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX10-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX10-FLUSH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX10-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX10-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX10-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX10-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX10-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX10-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX10-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX10-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX10-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX10-FLUSH-NEXT: buffer_load_dword v0, off, s[12:15], 0
; GFX10-FLUSH-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX10-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX10-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX10-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX10-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; GFX10-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX10-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX10-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX10-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX10-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX10-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX10-FLUSH-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX10-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX10-FLUSH-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-FLUSH-NEXT: s_endpgm
;
; GFX10-DENORM-LABEL: fmuladd_v2f16:
; GFX10-DENORM: ; %bb.0:
-; GFX10-DENORM-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX10-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX10-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX10-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX10-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX10-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX10-DENORM-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX10-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX10-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX10-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX10-DENORM-NEXT: s_mov_b32 s23, s3
; GFX10-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX10-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX10-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX10-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX10-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX10-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX10-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX10-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX10-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX10-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX10-DENORM-NEXT: s_mov_b32 s21, s11
; GFX10-DENORM-NEXT: buffer_load_dword v0, off, s[12:15], 0
; GFX10-DENORM-NEXT: buffer_load_dword v1, off, s[16:19], 0
; GFX10-DENORM-NEXT: buffer_load_dword v2, off, s[20:23], 0
-; GFX10-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX10-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX10-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX10-DENORM-NEXT: s_mov_b32 s1, s5
; GFX10-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX10-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GFX10-DENORM-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-DENORM-NEXT: s_endpgm
;
; GFX11-FLUSH-LABEL: fmuladd_v2f16:
; GFX11-FLUSH: ; %bb.0:
-; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1
-; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11
+; GFX11-FLUSH-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-FLUSH-NEXT: s_mov_b32 s2, -1
+; GFX11-FLUSH-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-FLUSH-NEXT: s_mov_b32 s14, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s15, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s18, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s19, s3
; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2
-; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3
-; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4
-; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5
+; GFX11-FLUSH-NEXT: s_mov_b32 s12, s6
+; GFX11-FLUSH-NEXT: s_mov_b32 s13, s7
+; GFX11-FLUSH-NEXT: s_mov_b32 s16, s8
+; GFX11-FLUSH-NEXT: s_mov_b32 s17, s9
; GFX11-FLUSH-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-FLUSH-NEXT: buffer_load_b32 v1, off, s[16:19], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6
-; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7
-; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10
-; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11
-; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0
-; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[4:7], 0
-; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1
+; GFX11-FLUSH-NEXT: s_mov_b32 s8, s10
+; GFX11-FLUSH-NEXT: s_mov_b32 s9, s11
+; GFX11-FLUSH-NEXT: s_mov_b32 s10, s2
+; GFX11-FLUSH-NEXT: s_mov_b32 s11, s3
+; GFX11-FLUSH-NEXT: s_mov_b32 s0, s4
+; GFX11-FLUSH-NEXT: buffer_load_b32 v2, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: s_mov_b32 s1, s5
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1)
; GFX11-FLUSH-NEXT: v_pk_mul_f16 v0, v0, v1
; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0)
; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FLUSH-NEXT: v_pk_add_f16 v0, v0, v2
-; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-FLUSH-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-FLUSH-NEXT: s_nop 0
; GFX11-FLUSH-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FLUSH-NEXT: s_endpgm
;
; GFX11-DENORM-LABEL: fmuladd_v2f16:
; GFX11-DENORM: ; %bb.0:
-; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-DENORM-NEXT: s_mov_b32 s10, -1
-; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-DENORM-NEXT: s_mov_b32 s14, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s15, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s18, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s19, s11
-; GFX11-DENORM-NEXT: s_mov_b32 s22, s10
-; GFX11-DENORM-NEXT: s_mov_b32 s23, s11
+; GFX11-DENORM-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-DENORM-NEXT: s_mov_b32 s2, -1
+; GFX11-DENORM-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-DENORM-NEXT: s_mov_b32 s14, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s15, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s18, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s19, s3
+; GFX11-DENORM-NEXT: s_mov_b32 s22, s2
+; GFX11-DENORM-NEXT: s_mov_b32 s23, s3
; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-DENORM-NEXT: s_mov_b32 s12, s2
-; GFX11-DENORM-NEXT: s_mov_b32 s13, s3
-; GFX11-DENORM-NEXT: s_mov_b32 s16, s4
-; GFX11-DENORM-NEXT: s_mov_b32 s17, s5
-; GFX11-DENORM-NEXT: s_mov_b32 s20, s6
-; GFX11-DENORM-NEXT: s_mov_b32 s21, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s12, s6
+; GFX11-DENORM-NEXT: s_mov_b32 s13, s7
+; GFX11-DENORM-NEXT: s_mov_b32 s16, s8
+; GFX11-DENORM-NEXT: s_mov_b32 s17, s9
+; GFX11-DENORM-NEXT: s_mov_b32 s20, s10
+; GFX11-DENORM-NEXT: s_mov_b32 s21, s11
; GFX11-DENORM-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-DENORM-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-DENORM-NEXT: s_mov_b32 s8, s0
-; GFX11-DENORM-NEXT: s_mov_b32 s9, s1
+; GFX11-DENORM-NEXT: s_mov_b32 s0, s4
+; GFX11-DENORM-NEXT: s_mov_b32 s1, s5
; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0)
; GFX11-DENORM-NEXT: v_pk_fma_f16 v0, v0, v1, v2
-; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-DENORM-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-DENORM-NEXT: s_nop 0
; GFX11-DENORM-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-DENORM-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
index aca7d3c720ceb..df4d3fcf14076 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.get.fpmode.ll
@@ -107,46 +107,46 @@ define amdgpu_kernel void @kernel_fpmode_i32(ptr addrspace(1) %ptr) {
;
; GFX8-LABEL: kernel_fpmode_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 19)
-; GFX8-NEXT: s_and_b32 s2, 0x7f3ff, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 19)
+; GFX8-NEXT: s_and_b32 s0, 0x7f3ff, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: kernel_fpmode_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
-; GFX9-NEXT: s_and_b32 s2, 0x87f3ff, s2
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
+; GFX9-NEXT: s_and_b32 s0, 0x87f3ff, s0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: kernel_fpmode_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_and_b32 s2, 0x87f3ff, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-NEXT: s_and_b32 s0, 0x87f3ff, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: kernel_fpmode_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_getreg_b32 s2, hwreg(HW_REG_MODE, 0, 24)
-; GFX11-NEXT: s_and_b32 s2, 0x87f3ff, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 24)
+; GFX11-NEXT: s_and_b32 s0, 0x87f3ff, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index ea823f30f26c2..1f62bccc7d1db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -28,16 +28,16 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_bf16:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: v_mov_b32_e32 v0, 0x7fff
-; GFX8CHECK-NEXT: s_movk_i32 s3, 0x7f80
+; GFX8CHECK-NEXT: s_movk_i32 s0, 0x7f80
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_and_b32_e32 v0, s2, v0
-; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s3, v0
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
+; GFX8CHECK-NEXT: v_and_b32_e32 v0, s4, v0
+; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s0, v0
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -58,27 +58,27 @@ define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) {
; GFX10CHECK-LABEL: sgpr_isnan_bf16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v1, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2
+; GFX10CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4
; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX10CHECK-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10CHECK-NEXT: global_store_dword v1, v0, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_bf16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v1, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s2
+; GFX11CHECK-NEXT: v_and_b32_e64 v0, 0x7fff, s4
; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0
; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc_lo
-; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11CHECK-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index da64c379672ef..26c426a653484 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -43,13 +43,13 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_f16:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_cmp_class_f16_e64 s[0:1], s4, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -67,25 +67,25 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) {
; GFX10CHECK-LABEL: sgpr_isnan_f16:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f16_e64 s0, s4, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f16:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, s4, 3
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
index 347e549e7cf56..c7e7e7bd82478 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.ll
@@ -37,13 +37,13 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
;
; GFX8CHECK-LABEL: sgpr_isnan_f32:
; GFX8CHECK: ; %bb.0:
-; GFX8CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[2:3], s2, 3
-; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s0
-; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s1
+; GFX8CHECK-NEXT: v_cmp_class_f32_e64 s[0:1], s4, 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s2
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s3
; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
; GFX8CHECK-NEXT: s_endpgm
;
@@ -61,26 +61,26 @@ define amdgpu_kernel void @sgpr_isnan_f32(ptr addrspace(1) %out, float %x) {
; GFX10CHECK-LABEL: sgpr_isnan_f32:
; GFX10CHECK: ; %bb.0:
; GFX10CHECK-NEXT: s_clause 0x1
-; GFX10CHECK-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX10CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX10CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f32_e64 s0, s4, 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[2:3]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f32:
; GFX11CHECK: ; %bb.0:
; GFX11CHECK-NEXT: s_clause 0x1
-; GFX11CHECK-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11CHECK-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s2, s2, 3
+; GFX11CHECK-NEXT: v_cmp_class_f32_e64 s0, s4, 3
; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
@@ -115,57 +115,46 @@ define amdgpu_kernel void @sgpr_isnan_f64(ptr addrspace(1) %out, double %x) {
; GFX7GLISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7GLISEL-NEXT: s_endpgm
;
-; GFX8SELDAG-LABEL: sgpr_isnan_f64:
-; GFX8SELDAG: ; %bb.0:
-; GFX8SELDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8SELDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX8SELDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX8SELDAG-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 3
-; GFX8SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
-; GFX8SELDAG-NEXT: flat_store_dword v[0:1], v2
-; GFX8SELDAG-NEXT: s_endpgm
-;
-; GFX8GLISEL-LABEL: sgpr_isnan_f64:
-; GFX8GLISEL: ; %bb.0:
-; GFX8GLISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8GLISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8GLISEL-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX8GLISEL-NEXT: v_mov_b32_e32 v1, s1
-; GFX8GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[2:3]
-; GFX8GLISEL-NEXT: flat_store_dword v[0:1], v2
-; GFX8GLISEL-NEXT: s_endpgm
+; GFX8CHECK-LABEL: sgpr_isnan_f64:
+; GFX8CHECK: ; %bb.0:
+; GFX8CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3
+; GFX8CHECK-NEXT: v_mov_b32_e32 v0, s4
+; GFX8CHECK-NEXT: v_mov_b32_e32 v1, s5
+; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX8CHECK-NEXT: flat_store_dword v[0:1], v2
+; GFX8CHECK-NEXT: s_endpgm
;
; GFX9CHECK-LABEL: sgpr_isnan_f64:
; GFX9CHECK: ; %bb.0:
-; GFX9CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX9CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[2:3], s[2:3], 3
-; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[2:3]
-; GFX9CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9CHECK-NEXT: v_cmp_class_f64_e64 s[0:1], s[6:7], 3
+; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX9CHECK-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9CHECK-NEXT: s_endpgm
;
; GFX10CHECK-LABEL: sgpr_isnan_f64:
; GFX10CHECK: ; %bb.0:
-; GFX10CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX10CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
-; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10CHECK-NEXT: v_cmp_class_f64_e64 s0, s[6:7], 3
+; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10CHECK-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10CHECK-NEXT: s_endpgm
;
; GFX11CHECK-LABEL: sgpr_isnan_f64:
; GFX11CHECK: ; %bb.0:
-; GFX11CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11CHECK-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0
; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s2, s[2:3], 3
+; GFX11CHECK-NEXT: v_cmp_class_f64_e64 s0, s[6:7], 3
; GFX11CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2
-; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11CHECK-NEXT: s_nop 0
; GFX11CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11CHECK-NEXT: s_endpgm
@@ -1469,3 +1458,6 @@ declare <7 x i1> @llvm.is.fpclass.v7f32(<7 x float>, i32)
declare <8 x i1> @llvm.is.fpclass.v8f32(<8 x float>, i32)
declare <16 x i1> @llvm.is.fpclass.v16f32(<16 x float>, i32)
declare <2 x i1> @llvm.is.fpclass.v2f64(<2 x double>, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX8GLISEL: {{.*}}
+; GFX8SELDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index ad70589b544ee..6f1d37476e62d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -188,14 +188,13 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
;
; GFX1100-SDAG-LABEL: s_log_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -207,23 +206,23 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, s4
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
@@ -233,11 +232,12 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3377d1cf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x41b17218, s4
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1090,18 +1090,18 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
@@ -1122,7 +1122,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1137,7 +1137,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1146,18 +1146,18 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x41b17218, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
@@ -1178,7 +1178,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3377d1cf, v1
; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x41b17218, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1193,7 +1193,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1775,31 +1775,31 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1836,31 +1836,31 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x41b17218, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x41b17218, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x41b17218, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x41b17218, s11
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3f317217, v0 :: v_dual_mul_f32 v6, 0x3f317217, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
@@ -1889,7 +1889,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 82c73fa441aaf..e8671f56464c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -188,14 +188,13 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
;
; GFX1100-SDAG-LABEL: s_log10_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-SDAG-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
@@ -207,23 +206,23 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-SDAG-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_mov_b32 v2, 0
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, s4
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log10_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
@@ -233,11 +232,12 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v2, 0x3284fbcf, v0
; GFX1100-GISEL-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 0x411a209b, s4
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1090,18 +1090,18 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s6, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
@@ -1122,7 +1122,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
; GFX1100-SDAG-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1137,7 +1137,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v0, v9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v3, v6
-; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b96 v4, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1146,18 +1146,18 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 0x411a209b, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
@@ -1178,7 +1178,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_fmac_f32_e32 v7, 0x3284fbcf, v1
; GFX1100-GISEL-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x411a209b, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_add_f32_e32 v4, v4, v7
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -1193,7 +1193,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -1775,31 +1775,31 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG: ; %bb.0:
; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s4
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s7
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s5
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v0, s7, v0 :: v_dual_mul_f32 v1, s6, v1
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s5, v2 :: v_dual_mul_f32 v3, s4, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v0, v0
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
@@ -1827,7 +1827,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v3, v0, v4
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v5, v14 :: v_dual_sub_f32 v0, v6, v15
-; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: global_store_b128 v7, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1836,31 +1836,31 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x411a209b, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 0x411a209b, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 0x411a209b, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 0x411a209b, s11
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v5, 0x3e9a209a, v0 :: v_dual_mul_f32 v6, 0x3e9a209a, v1
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
@@ -1889,7 +1889,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL-NEXT: v_dual_cndmask_b32 v3, v3, v8 :: v_dual_sub_f32 v2, v2, v14
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v3, v3, v15
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index b76e6214922bd..88b5e6159230d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -52,39 +52,39 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
;
; VI-SDAG-LABEL: s_log2_f32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-SDAG-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; VI-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
+; VI-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
; VI-SDAG-NEXT: v_log_f32_e32 v1, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v1, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
; VI-SDAG-NEXT: flat_store_dword v[0:1], v2
; VI-SDAG-NEXT: s_endpgm
;
; VI-GISEL-LABEL: s_log2_f32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; VI-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
; VI-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; VI-GISEL-NEXT: v_mul_f32_e32 v0, s4, v0
; VI-GISEL-NEXT: v_log_f32_e32 v0, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-GISEL-NEXT: v_sub_f32_e32 v2, v0, v1
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
; VI-GISEL-NEXT: flat_store_dword v[0:1], v2
; VI-GISEL-NEXT: s_endpgm
;
@@ -111,56 +111,57 @@ define amdgpu_kernel void @s_log2_f32(ptr addrspace(1) %out, float %in) {
; GFX900-GISEL-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, 0x800000
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; GFX900-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
; GFX900-GISEL-NEXT: v_log_f32_e32 v0, v0
+; GFX900-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX900-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX900-GISEL-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX900-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-GISEL-NEXT: global_store_dword v1, v0, s[2:3]
; GFX900-GISEL-NEXT: s_endpgm
;
; GFX1100-SDAG-LABEL: s_log2_f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
-; GFX1100-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: s_load_b32 s4, s[0:1], 0x2c
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v2, 0
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s3
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s2
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v1, s4, v1
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v1, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v1, v0
-; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
;
; GFX1100-GISEL-LABEL: s_log2_f32:
; GFX1100-GISEL: ; %bb.0:
-; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s2
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s4, 0x800000, s2
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s4
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s4
; GFX1100-GISEL-NEXT: v_mul_f32_e32 v0, s2, v0
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v1 :: v_dual_mov_b32 v1, 0
-; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-GISEL-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -537,7 +538,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-LABEL: s_log2_v3f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
@@ -559,9 +560,9 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; VI-SDAG-NEXT: v_log_f32_e32 v6, v6
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v4, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v3, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s3
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v3, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v3, s2
; VI-SDAG-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; VI-SDAG-NEXT: s_endpgm
;
@@ -658,9 +659,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v3f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s6
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
@@ -668,23 +667,25 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s2
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 1.0, 0x4f800000, s3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 1.0, 0x4f800000, s7
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
-; GFX1100-SDAG-NEXT: v_mul_f32_e32 v2, s6, v2
-; GFX1100-SDAG-NEXT: v_dual_mul_f32 v4, s5, v4 :: v_dual_mul_f32 v5, s4, v5
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v5, s4, v5
+; GFX1100-SDAG-NEXT: v_mul_f32_e32 v4, s5, v4
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
-; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v5, v5
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: v_log_f32_e32 v4, v4
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v6, 0
+; GFX1100-SDAG-NEXT: v_sub_f32_e32 v2, v2, v0
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
-; GFX1100-SDAG-NEXT: v_dual_sub_f32 v2, v2, v0 :: v_dual_sub_f32 v1, v4, v1
-; GFX1100-SDAG-NEXT: v_sub_f32_e32 v0, v5, v3
-; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[0:1]
+; GFX1100-SDAG-NEXT: v_dual_sub_f32 v0, v5, v3 :: v_dual_sub_f32 v1, v4, v1
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b96 v6, v[0:2], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -693,20 +694,20 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s7, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s6
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s7
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 0x42000000, s7
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s7
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
@@ -717,7 +718,7 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_sub_f32_e32 v2, v2, v5
-; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b96 v3, v[0:2], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
@@ -888,7 +889,7 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-LABEL: s_log2_v4f32:
; VI-SDAG: ; %bb.0:
; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-SDAG-NEXT: v_mov_b32_e32 v0, 0x800000
; VI-SDAG-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x4f800000
@@ -915,10 +916,10 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; VI-SDAG-NEXT: v_log_f32_e32 v9, v1
; VI-SDAG-NEXT: v_sub_f32_e32 v3, v4, v2
; VI-SDAG-NEXT: v_sub_f32_e32 v2, v6, v5
-; VI-SDAG-NEXT: v_mov_b32_e32 v5, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v5, s3
; VI-SDAG-NEXT: v_sub_f32_e32 v1, v8, v7
; VI-SDAG-NEXT: v_sub_f32_e32 v0, v9, v0
-; VI-SDAG-NEXT: v_mov_b32_e32 v4, s0
+; VI-SDAG-NEXT: v_mov_b32_e32 v4, s2
; VI-SDAG-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-SDAG-NEXT: s_endpgm
;
@@ -1033,9 +1034,7 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
;
; GFX1100-SDAG-LABEL: s_log2_v4f32:
; GFX1100-SDAG: ; %bb.0:
-; GFX1100-SDAG-NEXT: s_clause 0x1
; GFX1100-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s7
; GFX1100-SDAG-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s6
@@ -1048,23 +1047,24 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v6, 1.0, 0x4f800000, s8
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v7, 1.0, 0x4f800000, s9
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x42000000, s2
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v2, s7, v2 :: v_dual_mul_f32 v3, s6, v3
; GFX1100-SDAG-NEXT: v_dual_mul_f32 v6, s5, v6 :: v_dual_mul_f32 v7, s4, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, s3
+; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v2, v2
; GFX1100-SDAG-NEXT: v_log_f32_e32 v8, v3
-; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(TRANS32_DEP_3)
+; GFX1100-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-SDAG-NEXT: v_log_f32_e32 v6, v6
; GFX1100-SDAG-NEXT: v_log_f32_e32 v7, v7
-; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-SDAG-NEXT: v_mov_b32_e32 v9, 0
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v3, v2, v0 :: v_dual_sub_f32 v2, v8, v1
; GFX1100-SDAG-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-SDAG-NEXT: v_dual_sub_f32 v1, v6, v4 :: v_dual_sub_f32 v0, v7, v5
-; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX1100-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1100-SDAG-NEXT: global_store_b128 v9, v[0:3], s[2:3]
; GFX1100-SDAG-NEXT: s_nop 0
; GFX1100-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-SDAG-NEXT: s_endpgm
@@ -1073,36 +1073,36 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
; GFX1100-GISEL: ; %bb.0:
; GFX1100-GISEL-NEXT: s_clause 0x1
; GFX1100-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX1100-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1100-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX1100-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s2, 0x800000, s4
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s3, 0x800000, s5
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s6
-; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s7
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s8, 0x800000, s4
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s9, 0x800000, s5
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s10, 0x800000, s6
+; GFX1100-GISEL-NEXT: v_cmp_gt_f32_e64 s11, 0x800000, s7
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s2
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v0, 1.0, 0x4f800000, s8
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s9
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s2
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v3, 1.0, 0x4f800000, s11
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 0x42000000, s8
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v0, s4, v0 :: v_dual_mul_f32 v1, s5, v1
; GFX1100-GISEL-NEXT: v_dual_mul_f32 v2, s6, v2 :: v_dual_mul_f32 v3, s7, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s3
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 0x42000000, s9
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v0, v0
; GFX1100-GISEL-NEXT: v_log_f32_e32 v1, v1
; GFX1100-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(TRANS32_DEP_3)
; GFX1100-GISEL-NEXT: v_log_f32_e32 v2, v2
; GFX1100-GISEL-NEXT: v_log_f32_e32 v3, v3
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s8
-; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s9
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 0x42000000, s10
+; GFX1100-GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 0x42000000, s11
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v0, v0, v4 :: v_dual_sub_f32 v1, v1, v5
; GFX1100-GISEL-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-GISEL-NEXT: v_dual_sub_f32 v2, v2, v6 :: v_dual_sub_f32 v3, v3, v7
; GFX1100-GISEL-NEXT: v_mov_b32_e32 v4, 0
-; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1100-GISEL-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX1100-GISEL-NEXT: s_nop 0
; GFX1100-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index d056a97dc5444..b8065d2cb0cb2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -117,27 +117,27 @@ define amdgpu_kernel void @maxnum_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -175,79 +175,79 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
;
; VI-LABEL: maxnum_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -283,79 +283,79 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
;
; VI-LABEL: maxnum_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -396,26 +396,26 @@ define amdgpu_kernel void @maxnum_v2f16(
;
; VI-LABEL: maxnum_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[4:5], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s8, s[8:9], 0x0
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16:
@@ -456,19 +456,19 @@ define amdgpu_kernel void @maxnum_v2f16(
; GFX11-LABEL: maxnum_v2f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -506,61 +506,61 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
;
; VI-LABEL: maxnum_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_max_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_max_f16 v0, 0x44004200, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -596,61 +596,61 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
;
; VI-LABEL: maxnum_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4200
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_max_f16_e32 v0, 4.0, v0
; VI-NEXT: v_max_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: maxnum_v2f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_max_f16 v0, 0x42004400, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: maxnum_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_max_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -697,30 +697,30 @@ define amdgpu_kernel void @maxnum_v3f16(
;
; VI-LABEL: maxnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_max_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v3f16:
@@ -769,24 +769,24 @@ define amdgpu_kernel void @maxnum_v3f16(
; GFX11-LABEL: maxnum_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, v2, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v3, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -843,35 +843,35 @@ define amdgpu_kernel void @maxnum_v4f16(
;
; VI-LABEL: maxnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: s_lshr_b32 s4, s9, 16
; VI-NEXT: v_max_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_max_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_max_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_max_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: maxnum_v4f16:
@@ -918,22 +918,22 @@ define amdgpu_kernel void @maxnum_v4f16(
; GFX11-LABEL: maxnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v0, v3, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -980,79 +980,79 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
;
; VI-LABEL: fmax_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
; VI-NEXT: v_max_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_max_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s4, s6, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_max_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_max_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmax_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmax_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
; GFX10-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_max_f16 v0, 0x40004800, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmax_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pk_max_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_max_f16 v0, 0x40004800, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index f934a2de9247f..a78fc3a64ad7d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -117,26 +117,26 @@ define amdgpu_kernel void @minnum_f16_ieee(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_max_f16_e32 v1, v1, v1
; GFX11-NEXT: v_min_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -202,78 +202,78 @@ define amdgpu_kernel void @minnum_f16_imm_a(
;
; VI-LABEL: minnum_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -309,78 +309,78 @@ define amdgpu_kernel void @minnum_f16_imm_b(
;
; VI-LABEL: minnum_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
; GFX10-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -421,26 +421,26 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
;
; VI-LABEL: minnum_v2f16_ieee:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s8, s[4:5], 0x0
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dword s8, s[8:9], 0x0
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_ieee:
@@ -481,18 +481,18 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; GFX11-LABEL: minnum_v2f16_ieee:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX11-NEXT: v_pk_max_f16 v1, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_min_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -565,60 +565,60 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
;
; VI-LABEL: minnum_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4400
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_min_f16_e32 v0, 0x4200, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x44004200
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x44004200
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_min_f16 v0, 0x44004200, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_min_f16 v0, 0x44004200, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -654,60 +654,60 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
;
; VI-LABEL: minnum_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v2, 0x4200
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_max_f16_e64 v0, s4, s4
-; VI-NEXT: s_lshr_b32 s4, s4, 16
-; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: v_max_f16_e64 v0, s0, s0
+; VI-NEXT: s_lshr_b32 s0, s0, 16
+; VI-NEXT: v_max_f16_e64 v1, s0, s0
; VI-NEXT: v_min_f16_e32 v0, 4.0, v0
; VI-NEXT: v_min_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s4, s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
-; GFX9-NEXT: s_mov_b32 s4, 0x42004400
-; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: v_pk_max_f16 v0, s0, s0
+; GFX9-NEXT: s_mov_b32 s0, 0x42004400
+; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: minnum_v2f16_imm_b:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s0, s0
; GFX10-NEXT: v_pk_min_f16 v0, 0x42004400, v0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: minnum_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s2, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_min_f16 v0, 0x42004400, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -754,30 +754,30 @@ define amdgpu_kernel void @minnum_v3f16(
;
; VI-LABEL: minnum_v3f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v1, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v1, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_max_f16_e64 v1, s9, s9
-; VI-NEXT: v_max_f16_e64 v2, s3, s3
+; VI-NEXT: v_max_f16_e64 v2, s7, s7
; VI-NEXT: v_min_f16_e32 v1, v2, v1
-; VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v3f16:
@@ -826,23 +826,23 @@ define amdgpu_kernel void @minnum_v3f16(
; GFX11-LABEL: minnum_v3f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v1, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v1, s1, s1
; GFX11-NEXT: v_pk_max_f16 v2, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v0, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v0, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v2, v1
; GFX11-NEXT: v_pk_min_f16 v0, v3, v0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0 offset:4
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 offset:4
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -899,35 +899,35 @@ define amdgpu_kernel void @minnum_v4f16(
;
; VI-LABEL: minnum_v4f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_max_f16_e64 v0, s9, s9
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: s_lshr_b32 s0, s9, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: s_lshr_b32 s4, s9, 16
; VI-NEXT: v_min_f16_e32 v0, v1, v0
-; VI-NEXT: v_max_f16_e64 v1, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v1, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_min_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v0, v1
; VI-NEXT: v_max_f16_e64 v0, s8, s8
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
-; VI-NEXT: s_lshr_b32 s0, s8, 16
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
+; VI-NEXT: s_lshr_b32 s4, s8, 16
; VI-NEXT: v_min_f16_e32 v0, v2, v0
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
-; VI-NEXT: s_lshr_b32 s0, s2, 16
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
+; VI-NEXT: s_lshr_b32 s4, s6, 16
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
; VI-NEXT: v_min_f16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: minnum_v4f16:
@@ -974,21 +974,21 @@ define amdgpu_kernel void @minnum_v4f16(
; GFX11-LABEL: minnum_v4f16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x0
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[2:3], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s5, s5
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
; GFX11-NEXT: v_pk_max_f16 v1, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s4, s4
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_max_f16 v3, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_pk_min_f16 v1, v1, v0
; GFX11-NEXT: v_pk_min_f16 v0, v3, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1035,78 +1035,78 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
;
; VI-LABEL: fmin_v4f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x4400
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s0, s3, 16
-; VI-NEXT: v_max_f16_e64 v1, s3, s3
-; VI-NEXT: v_max_f16_e64 v3, s0, s0
-; VI-NEXT: v_max_f16_e64 v2, s2, s2
+; VI-NEXT: s_lshr_b32 s4, s7, 16
+; VI-NEXT: v_max_f16_e64 v1, s7, s7
+; VI-NEXT: v_max_f16_e64 v3, s4, s4
+; VI-NEXT: v_max_f16_e64 v2, s6, s6
; VI-NEXT: v_min_f16_e32 v1, 0x4200, v1
; VI-NEXT: v_min_f16_sdwa v0, v3, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_lshr_b32 s4, s6, 16
; VI-NEXT: v_or_b32_e32 v1, v1, v0
; VI-NEXT: v_min_f16_e32 v0, 0x4800, v2
-; VI-NEXT: v_max_f16_e64 v2, s0, s0
+; VI-NEXT: v_max_f16_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v3, 0x4000
; VI-NEXT: v_min_f16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: fmin_v4f16_imm_a:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s8, 0x44004200
; GFX9-NEXT: s_mov_b32 s9, 0x40004800
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX9-NEXT: v_pk_max_f16 v2, s2, s2
+; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
+; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: fmin_v4f16_imm_a:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX10-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX10-NEXT: v_pk_max_f16 v2, s0, s0
; GFX10-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX10-NEXT: v_pk_min_f16 v0, 0x40004800, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: fmin_v4f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX11-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_max_f16 v0, s3, s3
-; GFX11-NEXT: v_pk_max_f16 v2, s2, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_pk_max_f16 v0, s1, s1
+; GFX11-NEXT: v_pk_max_f16 v2, s0, s0
; GFX11-NEXT: v_pk_min_f16 v1, 0x44004200, v0
; GFX11-NEXT: v_pk_min_f16 v0, 0x40004800, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
index c3e665fa8269a..1423575dab106 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll
@@ -365,57 +365,57 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX9-LABEL: umulo_i64_s:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s1, s2
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_add_u32 s9, s9, s6
-; GFX9-NEXT: s_mul_hi_u32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s5, s4
-; GFX9-NEXT: s_addc_u32 s5, s10, 0
-; GFX9-NEXT: s_mul_i32 s1, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s1
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_add_i32 s1, s8, s7
-; GFX9-NEXT: s_add_i32 s1, s1, s6
-; GFX9-NEXT: s_mul_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT: s_cselect_b32 s1, 0, s1
-; GFX9-NEXT: s_cselect_b32 s0, 0, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mul_i32 s3, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT: s_add_u32 s9, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s5, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT: s_add_u32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_u32 s10, s5, s7
+; GFX9-NEXT: s_addc_u32 s0, s1, s0
+; GFX9-NEXT: s_addc_u32 s1, s10, 0
+; GFX9-NEXT: s_mul_i32 s5, s5, s7
+; GFX9-NEXT: s_add_u32 s0, s0, s5
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_add_i32 s3, s8, s3
+; GFX9-NEXT: s_add_i32 s3, s3, s2
+; GFX9-NEXT: s_mul_i32 s2, s4, s6
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX9-NEXT: s_cselect_b32 s0, 0, s3
+; GFX9-NEXT: s_cselect_b32 s1, 0, s2
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: umulo_i64_s:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
-; GFX10-NEXT: s_mul_hi_u32 s9, s1, s3
-; GFX10-NEXT: s_mul_i32 s1, s1, s3
-; GFX10-NEXT: s_add_u32 s3, s8, s7
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_u32 s3, s3, s6
-; GFX10-NEXT: s_addc_u32 s3, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s3, s1
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_mul_i32 s3, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT: s_mul_i32 s2, s5, s6
+; GFX10-NEXT: s_mul_hi_u32 s9, s5, s7
+; GFX10-NEXT: s_mul_i32 s5, s5, s7
+; GFX10-NEXT: s_add_u32 s7, s8, s3
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_add_u32 s7, s7, s2
+; GFX10-NEXT: s_addc_u32 s0, s1, s0
+; GFX10-NEXT: s_addc_u32 s1, s9, 0
+; GFX10-NEXT: s_add_u32 s0, s0, s5
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_add_i32 s3, s8, s3
+; GFX10-NEXT: s_mul_i32 s4, s4, s6
+; GFX10-NEXT: s_add_i32 s3, s3, s2
+; GFX10-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX10-NEXT: s_cselect_b32 s0, 0, s4
+; GFX10-NEXT: s_cselect_b32 s1, 0, s3
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -423,28 +423,28 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX11-LABEL: umulo_i64_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s7, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
-; GFX11-NEXT: s_mul_hi_u32 s9, s1, s3
-; GFX11-NEXT: s_mul_i32 s1, s1, s3
-; GFX11-NEXT: s_add_u32 s3, s8, s7
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_u32 s3, s3, s6
-; GFX11-NEXT: s_addc_u32 s3, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s3, s1
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_add_i32 s1, s8, s7
-; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_mul_i32 s3, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX11-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX11-NEXT: s_mul_i32 s2, s5, s6
+; GFX11-NEXT: s_mul_hi_u32 s9, s5, s7
+; GFX11-NEXT: s_mul_i32 s5, s5, s7
+; GFX11-NEXT: s_add_u32 s7, s8, s3
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_add_u32 s7, s7, s2
+; GFX11-NEXT: s_addc_u32 s0, s1, s0
+; GFX11-NEXT: s_addc_u32 s1, s9, 0
+; GFX11-NEXT: s_add_u32 s0, s0, s5
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_add_i32 s3, s8, s3
+; GFX11-NEXT: s_mul_i32 s4, s4, s6
+; GFX11-NEXT: s_add_i32 s3, s3, s2
+; GFX11-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX11-NEXT: s_cselect_b32 s0, 0, s4
+; GFX11-NEXT: s_cselect_b32 s1, 0, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -454,26 +454,26 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) {
;
; GFX12-LABEL: umulo_i64_s:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
-; GFX12-NEXT: s_mul_i32 s6, s0, s3
-; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT: s_mul_i32 s10, s1, s2
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT: s_mul_hi_u32 s11, s1, s3
-; GFX12-NEXT: s_add_co_u32 s4, s6, s10
-; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9
-; GFX12-NEXT: s_mul_i32 s8, s1, s3
+; GFX12-NEXT: s_mul_hi_u32 s3, s4, s7
+; GFX12-NEXT: s_mul_i32 s2, s4, s7
+; GFX12-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX12-NEXT: s_mul_i32 s10, s5, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: s_mul_hi_u32 s9, s5, s6
+; GFX12-NEXT: s_mul_hi_u32 s11, s5, s7
+; GFX12-NEXT: s_add_co_u32 s0, s2, s10
+; GFX12-NEXT: s_add_co_ci_u32 s0, s3, s9
+; GFX12-NEXT: s_mul_i32 s8, s5, s7
; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
-; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[8:9]
+; GFX12-NEXT: s_mul_u64 s[2:3], s[4:5], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[8:9]
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX12-NEXT: s_cselect_b32 s0, 0, s0
-; GFX12-NEXT: s_cselect_b32 s1, 0, s1
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX12-NEXT: s_cselect_b32 s0, 0, s2
+; GFX12-NEXT: s_cselect_b32 s1, 0, s3
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
; GFX12-NEXT: s_nop 0
@@ -540,81 +540,81 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX9-LABEL: smulo_i64_s:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mul_i32 s7, s0, s3
-; GFX9-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX9-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX9-NEXT: s_add_u32 s9, s8, s7
-; GFX9-NEXT: s_mul_i32 s6, s1, s2
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX9-NEXT: s_add_u32 s9, s9, s6
-; GFX9-NEXT: s_mul_hi_i32 s10, s1, s3
-; GFX9-NEXT: s_addc_u32 s4, s5, s4
-; GFX9-NEXT: s_addc_u32 s5, s10, 0
-; GFX9-NEXT: s_mul_i32 s9, s1, s3
-; GFX9-NEXT: s_add_u32 s4, s4, s9
-; GFX9-NEXT: s_addc_u32 s5, 0, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s2
-; GFX9-NEXT: s_subb_u32 s10, s5, 0
-; GFX9-NEXT: s_cmp_lt_i32 s1, 0
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_cselect_b32 s1, s10, s5
-; GFX9-NEXT: s_sub_u32 s9, s4, s0
-; GFX9-NEXT: s_subb_u32 s5, s1, 0
-; GFX9-NEXT: s_cmp_lt_i32 s3, 0
-; GFX9-NEXT: s_cselect_b32 s5, s5, s1
-; GFX9-NEXT: s_cselect_b32 s4, s9, s4
-; GFX9-NEXT: s_add_i32 s1, s8, s7
-; GFX9-NEXT: s_add_i32 s1, s1, s6
-; GFX9-NEXT: s_ashr_i32 s6, s1, 31
-; GFX9-NEXT: s_mov_b32 s7, s6
-; GFX9-NEXT: s_mul_i32 s0, s0, s2
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX9-NEXT: s_cselect_b32 s1, 0, s1
-; GFX9-NEXT: s_cselect_b32 s0, 0, s0
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_mul_i32 s3, s4, s7
+; GFX9-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX9-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX9-NEXT: s_add_u32 s9, s8, s3
+; GFX9-NEXT: s_mul_i32 s2, s5, s6
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX9-NEXT: s_add_u32 s9, s9, s2
+; GFX9-NEXT: s_mul_hi_i32 s10, s5, s7
+; GFX9-NEXT: s_addc_u32 s0, s1, s0
+; GFX9-NEXT: s_addc_u32 s1, s10, 0
+; GFX9-NEXT: s_mul_i32 s9, s5, s7
+; GFX9-NEXT: s_add_u32 s0, s0, s9
+; GFX9-NEXT: s_addc_u32 s1, 0, s1
+; GFX9-NEXT: s_sub_u32 s9, s0, s6
+; GFX9-NEXT: s_subb_u32 s10, s1, 0
+; GFX9-NEXT: s_cmp_lt_i32 s5, 0
+; GFX9-NEXT: s_cselect_b32 s0, s9, s0
+; GFX9-NEXT: s_cselect_b32 s1, s10, s1
+; GFX9-NEXT: s_sub_u32 s5, s0, s4
+; GFX9-NEXT: s_subb_u32 s9, s1, 0
+; GFX9-NEXT: s_cmp_lt_i32 s7, 0
+; GFX9-NEXT: s_cselect_b32 s1, s9, s1
+; GFX9-NEXT: s_cselect_b32 s0, s5, s0
+; GFX9-NEXT: s_add_i32 s3, s8, s3
+; GFX9-NEXT: s_add_i32 s5, s3, s2
+; GFX9-NEXT: s_ashr_i32 s2, s5, 31
+; GFX9-NEXT: s_mov_b32 s3, s2
+; GFX9-NEXT: s_mul_i32 s4, s4, s6
+; GFX9-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX9-NEXT: s_cselect_b32 s0, 0, s5
+; GFX9-NEXT: s_cselect_b32 s1, 0, s4
+; GFX9-NEXT: v_mov_b32_e32 v0, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: smulo_i64_s:
; GFX10: ; %bb.0: ; %bb
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s7, s0, s3
-; GFX10-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX10-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX10-NEXT: s_mul_i32 s6, s1, s2
-; GFX10-NEXT: s_add_u32 s11, s8, s7
-; GFX10-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX10-NEXT: s_add_u32 s11, s11, s6
-; GFX10-NEXT: s_mul_i32 s10, s1, s3
-; GFX10-NEXT: s_addc_u32 s4, s5, s4
-; GFX10-NEXT: s_addc_u32 s5, s9, 0
-; GFX10-NEXT: s_add_u32 s4, s4, s10
-; GFX10-NEXT: s_addc_u32 s5, 0, s5
-; GFX10-NEXT: s_sub_u32 s9, s4, s2
-; GFX10-NEXT: s_subb_u32 s10, s5, 0
-; GFX10-NEXT: s_cmp_lt_i32 s1, 0
-; GFX10-NEXT: s_cselect_b32 s1, s9, s4
-; GFX10-NEXT: s_cselect_b32 s4, s10, s5
-; GFX10-NEXT: s_sub_u32 s9, s1, s0
-; GFX10-NEXT: s_subb_u32 s5, s4, 0
-; GFX10-NEXT: s_cmp_lt_i32 s3, 0
-; GFX10-NEXT: s_mul_i32 s0, s0, s2
-; GFX10-NEXT: s_cselect_b32 s5, s5, s4
-; GFX10-NEXT: s_cselect_b32 s4, s9, s1
-; GFX10-NEXT: s_add_i32 s1, s8, s7
-; GFX10-NEXT: s_add_i32 s1, s1, s6
-; GFX10-NEXT: s_ashr_i32 s6, s1, 31
-; GFX10-NEXT: s_mov_b32 s7, s6
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX10-NEXT: s_cselect_b32 s0, 0, s0
-; GFX10-NEXT: s_cselect_b32 s1, 0, s1
+; GFX10-NEXT: s_mul_i32 s3, s4, s7
+; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX10-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX10-NEXT: s_mul_i32 s2, s5, s6
+; GFX10-NEXT: s_add_u32 s11, s8, s3
+; GFX10-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_mul_hi_i32 s9, s5, s7
+; GFX10-NEXT: s_add_u32 s11, s11, s2
+; GFX10-NEXT: s_mul_i32 s10, s5, s7
+; GFX10-NEXT: s_addc_u32 s0, s1, s0
+; GFX10-NEXT: s_addc_u32 s1, s9, 0
+; GFX10-NEXT: s_add_u32 s0, s0, s10
+; GFX10-NEXT: s_addc_u32 s1, 0, s1
+; GFX10-NEXT: s_sub_u32 s9, s0, s6
+; GFX10-NEXT: s_subb_u32 s10, s1, 0
+; GFX10-NEXT: s_cmp_lt_i32 s5, 0
+; GFX10-NEXT: s_cselect_b32 s0, s9, s0
+; GFX10-NEXT: s_cselect_b32 s1, s10, s1
+; GFX10-NEXT: s_sub_u32 s5, s0, s4
+; GFX10-NEXT: s_subb_u32 s9, s1, 0
+; GFX10-NEXT: s_cmp_lt_i32 s7, 0
+; GFX10-NEXT: s_mul_i32 s4, s4, s6
+; GFX10-NEXT: s_cselect_b32 s1, s9, s1
+; GFX10-NEXT: s_cselect_b32 s0, s5, s0
+; GFX10-NEXT: s_add_i32 s3, s8, s3
+; GFX10-NEXT: s_add_i32 s5, s3, s2
+; GFX10-NEXT: s_ashr_i32 s2, s5, 31
+; GFX10-NEXT: s_mov_b32 s3, s2
+; GFX10-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX10-NEXT: s_cselect_b32 s0, 0, s4
+; GFX10-NEXT: s_cselect_b32 s1, 0, s5
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
@@ -622,42 +622,42 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX11-LABEL: smulo_i64_s:
; GFX11: ; %bb.0: ; %bb
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s7, s0, s3
-; GFX11-NEXT: s_mul_hi_u32 s8, s0, s2
-; GFX11-NEXT: s_mul_hi_u32 s5, s0, s3
-; GFX11-NEXT: s_mul_i32 s6, s1, s2
-; GFX11-NEXT: s_add_u32 s11, s8, s7
-; GFX11-NEXT: s_mul_hi_u32 s4, s1, s2
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_mul_hi_i32 s9, s1, s3
-; GFX11-NEXT: s_add_u32 s11, s11, s6
-; GFX11-NEXT: s_mul_i32 s10, s1, s3
-; GFX11-NEXT: s_addc_u32 s4, s5, s4
-; GFX11-NEXT: s_addc_u32 s5, s9, 0
-; GFX11-NEXT: s_add_u32 s4, s4, s10
-; GFX11-NEXT: s_addc_u32 s5, 0, s5
-; GFX11-NEXT: s_sub_u32 s9, s4, s2
-; GFX11-NEXT: s_subb_u32 s10, s5, 0
-; GFX11-NEXT: s_cmp_lt_i32 s1, 0
-; GFX11-NEXT: s_cselect_b32 s1, s9, s4
-; GFX11-NEXT: s_cselect_b32 s4, s10, s5
-; GFX11-NEXT: s_sub_u32 s9, s1, s0
-; GFX11-NEXT: s_subb_u32 s5, s4, 0
-; GFX11-NEXT: s_cmp_lt_i32 s3, 0
-; GFX11-NEXT: s_mul_i32 s0, s0, s2
-; GFX11-NEXT: s_cselect_b32 s5, s5, s4
-; GFX11-NEXT: s_cselect_b32 s4, s9, s1
-; GFX11-NEXT: s_add_i32 s1, s8, s7
+; GFX11-NEXT: s_mul_i32 s3, s4, s7
+; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
+; GFX11-NEXT: s_mul_hi_u32 s1, s4, s7
+; GFX11-NEXT: s_mul_i32 s2, s5, s6
+; GFX11-NEXT: s_add_u32 s11, s8, s3
+; GFX11-NEXT: s_mul_hi_u32 s0, s5, s6
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_mul_hi_i32 s9, s5, s7
+; GFX11-NEXT: s_add_u32 s11, s11, s2
+; GFX11-NEXT: s_mul_i32 s10, s5, s7
+; GFX11-NEXT: s_addc_u32 s0, s1, s0
+; GFX11-NEXT: s_addc_u32 s1, s9, 0
+; GFX11-NEXT: s_add_u32 s0, s0, s10
+; GFX11-NEXT: s_addc_u32 s1, 0, s1
+; GFX11-NEXT: s_sub_u32 s9, s0, s6
+; GFX11-NEXT: s_subb_u32 s10, s1, 0
+; GFX11-NEXT: s_cmp_lt_i32 s5, 0
+; GFX11-NEXT: s_cselect_b32 s0, s9, s0
+; GFX11-NEXT: s_cselect_b32 s1, s10, s1
+; GFX11-NEXT: s_sub_u32 s5, s0, s4
+; GFX11-NEXT: s_subb_u32 s9, s1, 0
+; GFX11-NEXT: s_cmp_lt_i32 s7, 0
+; GFX11-NEXT: s_mul_i32 s4, s4, s6
+; GFX11-NEXT: s_cselect_b32 s1, s9, s1
+; GFX11-NEXT: s_cselect_b32 s0, s5, s0
+; GFX11-NEXT: s_add_i32 s3, s8, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s1, s6
-; GFX11-NEXT: s_ashr_i32 s6, s1, 31
+; GFX11-NEXT: s_add_i32 s5, s3, s2
+; GFX11-NEXT: s_ashr_i32 s2, s5, 31
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s7, s6
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], s[6:7]
-; GFX11-NEXT: s_cselect_b32 s0, 0, s0
-; GFX11-NEXT: s_cselect_b32 s1, 0, s1
+; GFX11-NEXT: s_mov_b32 s3, s2
+; GFX11-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX11-NEXT: s_cselect_b32 s0, 0, s4
+; GFX11-NEXT: s_cselect_b32 s1, 0, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off
@@ -667,39 +667,39 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) {
;
; GFX12-LABEL: smulo_i64_s:
; GFX12: ; %bb.0: ; %bb
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
-; GFX12-NEXT: s_mul_i32 s6, s0, s3
-; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2
-; GFX12-NEXT: s_mul_i32 s10, s1, s2
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7]
-; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2
-; GFX12-NEXT: s_mul_hi_i32 s11, s1, s3
-; GFX12-NEXT: s_add_co_u32 s4, s6, s10
-; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9
-; GFX12-NEXT: s_mul_i32 s8, s1, s3
+; GFX12-NEXT: s_mul_hi_u32 s3, s4, s7
+; GFX12-NEXT: s_mul_i32 s2, s4, s7
+; GFX12-NEXT: s_mul_hi_u32 s0, s4, s6
+; GFX12-NEXT: s_mul_i32 s10, s5, s6
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[2:3]
+; GFX12-NEXT: s_mul_hi_u32 s9, s5, s6
+; GFX12-NEXT: s_mul_hi_i32 s11, s5, s7
+; GFX12-NEXT: s_add_co_u32 s0, s2, s10
+; GFX12-NEXT: s_add_co_ci_u32 s0, s3, s9
+; GFX12-NEXT: s_mul_i32 s8, s5, s7
; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0
-; GFX12-NEXT: s_cmp_lt_i32 s1, 0
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[8:9]
-; GFX12-NEXT: s_mov_b32 s4, s2
+; GFX12-NEXT: s_cmp_lt_i32 s5, 0
+; GFX12-NEXT: s_add_nc_u64 s[2:3], s[0:1], s[8:9]
+; GFX12-NEXT: s_mov_b32 s0, s6
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_sub_nc_u64 s[8:9], s[6:7], s[4:5]
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_cselect_b32 s7, s9, s7
-; GFX12-NEXT: s_cselect_b32 s6, s8, s6
-; GFX12-NEXT: s_cmp_lt_i32 s3, 0
-; GFX12-NEXT: s_sub_nc_u64 s[4:5], s[6:7], s[4:5]
-; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
-; GFX12-NEXT: s_cselect_b32 s3, s5, s7
-; GFX12-NEXT: s_cselect_b32 s2, s4, s6
-; GFX12-NEXT: s_ashr_i32 s4, s1, 31
+; GFX12-NEXT: s_sub_nc_u64 s[8:9], s[2:3], s[0:1]
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_cselect_b32 s3, s9, s3
+; GFX12-NEXT: s_cselect_b32 s2, s8, s2
+; GFX12-NEXT: s_cmp_lt_i32 s7, 0
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_cselect_b32 s1, s1, s3
+; GFX12-NEXT: s_cselect_b32 s0, s0, s2
+; GFX12-NEXT: s_ashr_i32 s2, s5, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s5, s4
-; GFX12-NEXT: s_cmp_lg_u64 s[2:3], s[4:5]
-; GFX12-NEXT: s_cselect_b32 s0, 0, s0
-; GFX12-NEXT: s_cselect_b32 s1, 0, s1
+; GFX12-NEXT: s_mov_b32 s3, s2
+; GFX12-NEXT: s_cmp_lg_u64 s[0:1], s[2:3]
+; GFX12-NEXT: s_cselect_b32 s0, 0, s4
+; GFX12-NEXT: s_cselect_b32 s1, 0, s5
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 9fcbdf3968693..27ea3e821d726 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -136,12 +136,12 @@ define amdgpu_kernel void @local_size_xy(ptr addrspace(1) %out) {
; VI-LABEL: local_size_xy:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -179,14 +179,14 @@ define amdgpu_kernel void @local_size_xz(ptr addrspace(1) %out) {
;
; VI-LABEL: local_size_xz:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s2, s[0:1], 0x18
-; VI-NEXT: s_load_dword s3, s[0:1], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x18
+; VI-NEXT: s_load_dword s5, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -269,14 +269,14 @@ define amdgpu_kernel void @local_size_xyz(ptr addrspace(1) %out) {
; VI-LABEL: local_size_xyz:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x18
-; VI-NEXT: s_load_dword s4, s[0:1], 0x20
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s6, s[0:1], 0x20
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s2, s2, s3
-; VI-NEXT: s_add_i32 s2, s2, s4
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: s_add_i32 s0, s0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 84afa3b0096ea..18c910ace6b4b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -30,38 +30,38 @@ define amdgpu_kernel void @rint_f16(
;
; GFX89-LABEL: rint_f16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: s_mov_b32 s10, s6
-; GFX89-NEXT: s_mov_b32 s11, s7
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s3, 0xf000
+; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_mov_b32 s10, s2
+; GFX89-NEXT: s_mov_b32 s11, s3
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s8, s2
-; GFX89-NEXT: s_mov_b32 s9, s3
+; GFX89-NEXT: s_mov_b32 s8, s6
+; GFX89-NEXT: s_mov_b32 s9, s7
; GFX89-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GFX89-NEXT: s_mov_b32 s4, s0
-; GFX89-NEXT: s_mov_b32 s5, s1
+; GFX89-NEXT: s_mov_b32 s0, s4
+; GFX89-NEXT: s_mov_b32 s1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_rndne_f16_e32 v0, v0
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: rint_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -111,64 +111,64 @@ define amdgpu_kernel void @rint_v2f16(
;
; VI-LABEL: rint_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_rndne_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_rndne_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: rint_v2f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_rndne_f16_e32 v1, v0
; GFX9-NEXT: v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX11-LABEL: rint_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_rndne_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
index c5d2f791d1677..d1e20083e5ffd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll
@@ -24,60 +24,42 @@ define amdgpu_kernel void @round_f32(ptr addrspace(1) %out, float %x) #0 {
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_f32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s6, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f32_e32 v0, s6
-; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[4:5]
-; GFX8-NEXT: s_brev_b32 s4, -2
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s4, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f32_e32 v0, s2
-; GFX9-NEXT: v_sub_f32_e32 v1, s2, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: s_brev_b32 s0, -2
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_f32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f32_e32 v0, s2
+; GFX89-NEXT: v_sub_f32_e32 v1, s2, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: s_brev_b32 s0, -2
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_bfi_b32 v1, s0, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX89-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v1, s2, v0
-; GFX11-NEXT: v_cmp_ge_f32_e64 s3, |v1|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v1|, 0.5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -253,90 +235,52 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX6-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_v4f32:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_brev_b32 s10, -2
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f32_e32 v0, s7
-; GFX8-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v3, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s6
-; GFX8-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[8:9], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v2
-; GFX8-NEXT: v_add_f32_e32 v2, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s5
-; GFX8-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NEXT: v_bfi_b32 v1, s10, v1, v4
-; GFX8-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX8-NEXT: v_trunc_f32_e32 v0, s4
-; GFX8-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX8-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v5, s4
-; GFX8-NEXT: v_bfi_b32 v4, s10, v4, v5
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_v4f32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX9-NEXT: s_brev_b32 s2, -2
-; GFX9-NEXT: s_mov_b32 s11, 0xf000
-; GFX9-NEXT: s_mov_b32 s10, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f32_e32 v0, s7
-; GFX9-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s7
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v3, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s6
-; GFX9-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v2
-; GFX9-NEXT: v_add_f32_e32 v2, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s5
-; GFX9-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v4, s5
-; GFX9-NEXT: v_bfi_b32 v1, s2, v1, v4
-; GFX9-NEXT: v_add_f32_e32 v1, v0, v1
-; GFX9-NEXT: v_trunc_f32_e32 v0, s4
-; GFX9-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v5, s4
-; GFX9-NEXT: v_bfi_b32 v4, s2, v4, v5
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_v4f32:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX89-NEXT: s_brev_b32 s2, -2
+; GFX89-NEXT: s_mov_b32 s11, 0xf000
+; GFX89-NEXT: s_mov_b32 s10, -1
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f32_e32 v0, s7
+; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v2, s7
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v3, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s6
+; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
+; GFX89-NEXT: v_add_f32_e32 v2, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s5
+; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v4, s5
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4
+; GFX89-NEXT: v_add_f32_e32 v1, v0, v1
+; GFX89-NEXT: v_trunc_f32_e32 v0, s4
+; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
+; GFX89-NEXT: v_mov_b32_e32 v5, s4
+; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5
+; GFX89-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_v4f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s11, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
@@ -346,27 +290,26 @@ define amdgpu_kernel void @round_v4f32(ptr addrspace(1) %out, <4 x float> %in) #
; GFX11-NEXT: v_dual_sub_f32 v2, s7, v0 :: v_dual_sub_f32 v3, s6, v1
; GFX11-NEXT: v_dual_sub_f32 v6, s5, v4 :: v_dual_sub_f32 v7, s4, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v6|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v6|, 0.5
; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, v3, s6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v7|, 0.5
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v6, 0x7fffffff, v6, s5
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s4
; GFX11-NEXT: v_dual_add_f32 v1, v4, v6 :: v_dual_add_f32 v0, v5, v7
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -481,77 +424,78 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX89-LABEL: round_v8f32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
-; GFX89-NEXT: s_brev_b32 s14, -2
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX89-NEXT: s_mov_b32 s3, 0xf000
-; GFX89-NEXT: s_mov_b32 s2, -1
+; GFX89-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GFX89-NEXT: s_brev_b32 s2, -2
+; GFX89-NEXT: s_mov_b32 s15, 0xf000
+; GFX89-NEXT: s_mov_b32 s14, -1
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_trunc_f32_e32 v0, s7
; GFX89-NEXT: v_sub_f32_e32 v1, s7, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v2, s7
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX89-NEXT: v_add_f32_e32 v3, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s6
; GFX89-NEXT: v_sub_f32_e32 v1, s6, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[12:13], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[12:13]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v2, s6
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v2
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v2
; GFX89-NEXT: v_add_f32_e32 v2, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s5
; GFX89-NEXT: v_sub_f32_e32 v1, s5, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v1|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[6:7]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v1|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v4, s5
-; GFX89-NEXT: v_bfi_b32 v1, s14, v1, v4
+; GFX89-NEXT: v_bfi_b32 v1, s2, v1, v4
; GFX89-NEXT: v_add_f32_e32 v1, v0, v1
; GFX89-NEXT: v_trunc_f32_e32 v0, s4
; GFX89-NEXT: v_sub_f32_e32 v4, s4, v0
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[6:7], |v4|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[6:7]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v4|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v4, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v5, s4
-; GFX89-NEXT: v_bfi_b32 v4, s14, v4, v5
+; GFX89-NEXT: v_bfi_b32 v4, s2, v4, v5
; GFX89-NEXT: v_add_f32_e32 v0, v0, v4
; GFX89-NEXT: v_trunc_f32_e32 v4, s11
; GFX89-NEXT: v_sub_f32_e32 v5, s11, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v6, s11
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX89-NEXT: v_add_f32_e32 v7, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s10
; GFX89-NEXT: v_sub_f32_e32 v5, s10, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v6, s10
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v6
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v6
; GFX89-NEXT: v_add_f32_e32 v6, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s9
; GFX89-NEXT: v_sub_f32_e32 v5, s9, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v8, s9
-; GFX89-NEXT: v_bfi_b32 v5, s14, v5, v8
+; GFX89-NEXT: v_bfi_b32 v5, s2, v5, v8
; GFX89-NEXT: v_add_f32_e32 v5, v4, v5
; GFX89-NEXT: v_trunc_f32_e32 v4, s8
; GFX89-NEXT: v_sub_f32_e32 v8, s8, v4
-; GFX89-NEXT: v_cmp_ge_f32_e64 s[4:5], |v8|, 0.5
-; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[4:5]
+; GFX89-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
; GFX89-NEXT: v_mov_b32_e32 v9, s8
-; GFX89-NEXT: v_bfi_b32 v8, s14, v8, v9
+; GFX89-NEXT: v_bfi_b32 v8, s2, v8, v9
; GFX89-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX89-NEXT: buffer_store_dwordx4 v[4:7], off, s[12:15], 0 offset:16
+; GFX89-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_v8f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x44
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f32_e32 v0, s7
; GFX11-NEXT: v_trunc_f32_e32 v1, s6
@@ -564,57 +508,56 @@ define amdgpu_kernel void @round_v8f32(ptr addrspace(1) %out, <8 x float> %in) #
; GFX11-NEXT: v_trunc_f32_e32 v9, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_sub_f32_e32 v12, s11, v5
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v2|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v11, s4, v8
; GFX11-NEXT: v_trunc_f32_e32 v6, s10
; GFX11-NEXT: v_sub_f32_e32 v14, s9, v9
; GFX11-NEXT: v_trunc_f32_e32 v10, s8
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v3|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v3|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, s7
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v7|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v7|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v16, 0x7fffffff, v3, s6
-; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v11|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v11|, 0.5
; GFX11-NEXT: v_sub_f32_e32 v13, s10, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_add_f32 v3, v0, v2 :: v_dual_add_f32 v2, v1, v16
; GFX11-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, s5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v12|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v12|, 0.5
; GFX11-NEXT: v_add_f32_e32 v1, v4, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, v11, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v13|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v13|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v12, 0x7fffffff, v12, s11
-; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s2
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v14|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v13, 0, 1.0, s0
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v14|, 0.5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v7, v5, v12
; GFX11-NEXT: v_bfi_b32 v13, 0x7fffffff, v13, s10
; GFX11-NEXT: v_sub_f32_e32 v15, s8, v10
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, 1.0, s0
; GFX11-NEXT: v_add_f32_e32 v6, v6, v13
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cmp_ge_f32_e64 s2, |v15|, 0.5
+; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v15|, 0.5
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v14, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v15, 0, 1.0, s0
; GFX11-NEXT: v_dual_add_f32 v5, v9, v0 :: v_dual_add_f32 v0, v8, v11
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, v15, s8
; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 offset:16
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b128 v[4:7], off, s[12:15], 0 offset:16
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[12:15], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -702,62 +645,43 @@ define amdgpu_kernel void @round_f16(ptr addrspace(1) %out, i32 %x.arg) #0 {
; GFX6-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
;
-; GFX8-LABEL: round_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX8-NEXT: s_movk_i32 s5, 0x7fff
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_trunc_f16_e32 v1, s4
-; GFX8-NEXT: v_sub_f16_e32 v2, s4, v1
-; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NEXT: v_bfi_b32 v0, s5, v0, v2
-; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX8-NEXT: buffer_store_short v0, off, s[0:3], 0
-; GFX8-NEXT: s_endpgm
-;
-; GFX9-LABEL: round_f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX9-NEXT: s_movk_i32 s0, 0x7fff
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_trunc_f16_e32 v1, s2
-; GFX9-NEXT: v_sub_f16_e32 v2, s2, v1
-; GFX9-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
-; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: v_add_f16_e32 v0, v1, v0
-; GFX9-NEXT: buffer_store_short v0, off, s[4:7], 0
-; GFX9-NEXT: s_endpgm
+; GFX89-LABEL: round_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX89-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c00
+; GFX89-NEXT: s_movk_i32 s0, 0x7fff
+; GFX89-NEXT: s_mov_b32 s7, 0xf000
+; GFX89-NEXT: s_waitcnt lgkmcnt(0)
+; GFX89-NEXT: v_trunc_f16_e32 v1, s2
+; GFX89-NEXT: v_sub_f16_e32 v2, s2, v1
+; GFX89-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX89-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_bfi_b32 v0, s0, v0, v2
+; GFX89-NEXT: s_mov_b32 s6, -1
+; GFX89-NEXT: v_add_f16_e32 v0, v1, v0
+; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT: s_endpgm
;
; GFX11-LABEL: round_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_trunc_f16_e32 v0, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f16_e32 v1, s2, v0
-; GFX11-NEXT: v_cmp_ge_f16_e64 s3, |v1|, 0.5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v1|, 0.5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 0x3c00, s0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff, v1, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -828,30 +752,30 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
;
; GFX8-LABEL: round_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v0, 0x3c00
-; GFX8-NEXT: s_movk_i32 s6, 0x7fff
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_movk_i32 s1, 0x7fff
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s5, s4, 16
-; GFX8-NEXT: v_trunc_f16_e32 v1, s5
-; GFX8-NEXT: v_sub_f16_e32 v2, s5, v1
+; GFX8-NEXT: s_lshr_b32 s0, s2, 16
+; GFX8-NEXT: v_trunc_f16_e32 v1, s0
+; GFX8-NEXT: v_sub_f16_e32 v2, s0, v1
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v2|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_bfi_b32 v2, s6, v2, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: v_bfi_b32 v2, s1, v2, v3
; GFX8-NEXT: v_add_f16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_trunc_f16_e32 v2, s4
-; GFX8-NEXT: v_sub_f16_e32 v3, s4, v2
+; GFX8-NEXT: v_trunc_f16_e32 v2, s2
+; GFX8-NEXT: v_sub_f16_e32 v3, s2, v2
; GFX8-NEXT: v_cmp_ge_f16_e64 vcc, |v3|, 0.5
; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: v_bfi_b32 v0, s6, v0, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v3
; GFX8-NEXT: v_add_f16_e32 v0, v2, v0
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: round_v2f16:
@@ -886,7 +810,9 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-NEXT: v_trunc_f16_e32 v1, s2
@@ -895,22 +821,20 @@ define amdgpu_kernel void @round_v2f16(ptr addrspace(1) %out, i32 %in.arg) #0 {
; GFX11-NEXT: v_sub_f16_e32 v3, s2, v1
; GFX11-NEXT: v_sub_f16_e32 v2, s3, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v2|, 0.5
-; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s4
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v2|, 0.5
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x3c00, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_ge_f16_e64 s4, |v3|, 0.5
+; GFX11-NEXT: v_cmp_ge_f16_e64 s0, |v3|, 0.5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff, v2, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s4
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 0x3c00, s0
; GFX11-NEXT: v_add_f16_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff, v3, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: v_add_f16_e32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 2ce0a628686ea..4082ad70e23b2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -30,55 +30,55 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: sin_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f16_e32 v0, 0.15915494, v0
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX9-NEXT: v_sin_f16_e32 v1, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX10-NEXT: v_sin_f16_e32 v1, v1
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sin_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_sin_f16_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -121,10 +121,10 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
;
; GFX8-LABEL: sin_v2f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -134,50 +134,50 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX8-NEXT: v_fract_f16_e32 v0, v0
; GFX8-NEXT: v_sin_f16_sdwa v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_sin_f16_e32 v3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_or_b32_e32 v2, v3, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: sin_v2f16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX9-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_sin_f16_e32 v2, v3
; GFX9-NEXT: v_sin_f16_e32 v1, v1
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sin_v2f16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_f16_e32 v3, 0.15915494, v1
; GFX10-NEXT: v_mul_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX10-NEXT: v_sin_f16_e32 v2, v3
; GFX10-NEXT: v_sin_f16_e32 v1, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v2, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: sin_v2f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1
@@ -188,7 +188,7 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
; GFX11-NEXT: v_sin_f16_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index f2d57ba902e73..dc19189484cb5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -29,38 +29,38 @@ define amdgpu_kernel void @sqrt_f16(
;
; VI-LABEL: sqrt_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sqrt_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sqrt_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -109,37 +109,37 @@ define amdgpu_kernel void @sqrt_v2f16(
;
; VI-LABEL: sqrt_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_sqrt_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sqrt_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
@@ -147,7 +147,7 @@ define amdgpu_kernel void @sqrt_v2f16(
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index d1e2ddcdc6eac..3fb1699fb2efe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -29,38 +29,38 @@ define amdgpu_kernel void @trunc_f16(
;
; VI-LABEL: trunc_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_trunc_f16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: trunc_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_trunc_f16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -110,44 +110,44 @@ define amdgpu_kernel void @trunc_v2f16(
;
; VI-LABEL: trunc_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_trunc_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_trunc_f16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: trunc_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_trunc_f16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index cfaefca3a516d..9de4eae7feb2e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -34,26 +34,26 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-NOHSA-LABEL: constant_load_f64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; GFX12-LABEL: constant_load_f64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 502cd14284e15..876c24674d046 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -25,13 +25,13 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: constant_load_i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
@@ -65,14 +65,14 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
;
; GFX12-LABEL: constant_load_i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -101,13 +101,13 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v2i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -140,12 +140,12 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v2i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -174,13 +174,13 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v3i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -212,12 +212,12 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v3i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -246,13 +246,13 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v4i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -285,12 +285,12 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v4i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -319,13 +319,13 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-LABEL: constant_load_v8i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -358,12 +358,12 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v8i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -392,13 +392,13 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v16i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -431,12 +431,12 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v16i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -460,13 +460,13 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v32i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -488,12 +488,12 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v32i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -518,14 +518,14 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v64i1:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -547,13 +547,13 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v64i1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -582,13 +582,13 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_zextload_i1_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -611,12 +611,12 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i1_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -647,13 +647,13 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_sextload_i1_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -678,14 +678,14 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i1_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -715,13 +715,13 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
@@ -744,12 +744,12 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -780,13 +780,13 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -811,14 +811,14 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i1_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -850,13 +850,13 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v2i1_to_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2
@@ -884,17 +884,17 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i1_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -926,13 +926,13 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v2i1_to_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 1, v2
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
@@ -961,16 +961,16 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i1_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1004,13 +1004,13 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v3i1_to_v3i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_and_b32_e32 v5, 1, v0
@@ -1046,10 +1046,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i1_to_v3i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v3, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -1060,7 +1060,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1094,13 +1094,13 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v3i1_to_v3i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1137,10 +1137,10 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i1_to_v3i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v3, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v3, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 1, v0
@@ -1148,7 +1148,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v4, 0, 1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1182,13 +1182,13 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v4i1_to_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1226,10 +1226,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i1_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
@@ -1244,7 +1244,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1278,13 +1278,13 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v4i1_to_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -1323,10 +1323,10 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i1_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -1337,7 +1337,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_bfe_i32 v1, v5, 0, 1
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1376,17 +1376,17 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v1, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 5, v1
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 1, v1
@@ -1443,10 +1443,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i1_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 5, v0
; GFX12-NEXT: v_lshrrev_b16 v5, 1, v0
@@ -1467,8 +1467,8 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v10
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1507,17 +1507,17 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v8i1_to_v8i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 4, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v5, 5, v0
@@ -1578,10 +1578,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i1_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 4, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
@@ -1599,8 +1599,8 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v9, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1649,25 +1649,25 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v1, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, 1
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 12, v1
@@ -1767,10 +1767,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i1_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 13, v0
; GFX12-NEXT: v_lshrrev_b16 v13, 1, v0
@@ -1811,10 +1811,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v22
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1863,24 +1863,24 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v16i1_to_v16i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v12, 12, v0
@@ -1990,10 +1990,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i1_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 12, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 13, v0
@@ -2027,10 +2027,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v19, 0, 1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2132,112 +2132,112 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2
; GFX8-NEXT: v_and_b32_e32 v24, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
; GFX8-NEXT: v_and_b32_e32 v22, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2
; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
; GFX8-NEXT: v_and_b32_e32 v23, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT: s_lshr_b32 s2, s4, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
+; GFX8-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 11, s2
; GFX8-NEXT: v_and_b32_e32 v26, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v2
; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s0
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s2
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x10018
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s2
-; GFX8-NEXT: s_and_b32 s6, s4, 1
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x10013
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x10012
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x10011
-; GFX8-NEXT: s_bfe_u32 s10, s4, 0x10010
-; GFX8-NEXT: s_bfe_u32 s2, s4, 0x10017
-; GFX8-NEXT: s_bfe_u32 s3, s4, 0x10016
-; GFX8-NEXT: s_bfe_u32 s11, s4, 0x10015
-; GFX8-NEXT: s_bfe_u32 s12, s4, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v11, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v10, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 3, s0
+; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10018
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s0
+; GFX8-NEXT: s_and_b32 s6, s2, 1
+; GFX8-NEXT: s_bfe_u32 s7, s2, 0x10013
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x10012
+; GFX8-NEXT: s_bfe_u32 s9, s2, 0x10011
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10010
+; GFX8-NEXT: s_bfe_u32 s0, s2, 0x10017
+; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10016
+; GFX8-NEXT: s_bfe_u32 s11, s2, 0x10015
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10014
+; GFX8-NEXT: v_mov_b32_e32 v11, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: v_mov_b32_e32 v9, s11
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s4
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v25, 2, s2
; GFX8-NEXT: v_mov_b32_e32 v8, s10
; GFX8-NEXT: v_mov_b32_e32 v9, s9
; GFX8-NEXT: v_mov_b32_e32 v10, s8
; GFX8-NEXT: v_mov_b32_e32 v11, s7
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v22
; GFX8-NEXT: v_and_b32_e32 v10, 1, v25
; GFX8-NEXT: v_and_b32_e32 v22, 1, v21
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v24
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s4
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 12, s2
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v23
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s4
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2
; GFX8-NEXT: v_and_b32_e32 v20, 1, v20
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 4, s2
; GFX8-NEXT: v_mov_b32_e32 v8, 1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_and_b32_e32 v12, 1, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX8-NEXT: v_and_b32_sdwa v16, v0, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v15
-; GFX8-NEXT: v_mov_b32_e32 v17, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v26
; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
@@ -2245,7 +2245,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX8-NEXT: v_mov_b32_e32 v0, s5
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -2349,56 +2349,56 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i1_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v1, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v10, 1, s2
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v1, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v10, 1, s0
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_and_b32 v33, 1, v1
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s0
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s3, s0, 1
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s1
; GFX12-NEXT: v_and_b32_e32 v25, 1, v14
; GFX12-NEXT: v_and_b32_e32 v26, 1, v18
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10012
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10013
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10012
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v30, s6 :: v_dual_and_b32 v13, 1, v13
-; GFX12-NEXT: s_bfe_u32 s7, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10010
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10017
+; GFX12-NEXT: s_bfe_u32 s7, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10010
+; GFX12-NEXT: s_bfe_u32 s9, s0, 0x10017
; GFX12-NEXT: v_dual_mov_b32 v27, s9 :: v_dual_and_b32 v24, 1, v6
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10016
+; GFX12-NEXT: s_bfe_u32 s10, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10014
+; GFX12-NEXT: s_bfe_u32 s11, s0, 0x10014
; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10015
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v22, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_and_b32 v1, 1, v10
; GFX12-NEXT: v_dual_mov_b32 v29, s7 :: v_dual_and_b32 v2, 1, v11
-; GFX12-NEXT: v_dual_mov_b32 v31, s3 :: v_dual_and_b32 v6, 1, v7
+; GFX12-NEXT: v_dual_mov_b32 v31, s1 :: v_dual_and_b32 v6, 1, v7
; GFX12-NEXT: v_and_b32_e32 v4, 1, v5
; GFX12-NEXT: v_and_b32_e32 v10, 1, v3
; GFX12-NEXT: v_and_b32_e32 v14, 1, v19
@@ -2412,23 +2412,23 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_and_b32 v23, 0xffff, v20
; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v20, 1, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_and_b32 v17, 0xffff, v25
-; GFX12-NEXT: v_mov_b32_e32 v25, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v17, 0xffff, v25
+; GFX12-NEXT: v_mov_b32_e32 v25, s0
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v22
; GFX12-NEXT: v_and_b32_e32 v22, 1, v12
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_and_b32 v15, 0xffff, v21
+; GFX12-NEXT: v_dual_mov_b32 v12, s2 :: v_dual_and_b32 v15, 0xffff, v21
; GFX12-NEXT: v_and_b32_e32 v21, 0xffff, v33
; GFX12-NEXT: v_and_b32_e32 v8, 1, v8
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2530,111 +2530,111 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v32i1_to_v32i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s3, s2, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 13, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 8, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 11, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v25, 1, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 3, s3
-; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_i32 s5, s2, 0x10000
-; GFX8-NEXT: s_bfe_i32 s6, s2, 0x10013
-; GFX8-NEXT: s_bfe_i32 s7, s2, 0x10012
-; GFX8-NEXT: s_bfe_i32 s8, s2, 0x10011
-; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10010
-; GFX8-NEXT: s_bfe_i32 s3, s2, 0x10017
-; GFX8-NEXT: s_bfe_i32 s10, s2, 0x10016
-; GFX8-NEXT: s_bfe_i32 s11, s2, 0x10015
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 13, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 15, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 8, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 11, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v25, 1, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 3, s1
+; GFX8-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_i32 s3, s0, 0x10000
+; GFX8-NEXT: s_bfe_i32 s6, s0, 0x10013
+; GFX8-NEXT: s_bfe_i32 s7, s0, 0x10012
+; GFX8-NEXT: s_bfe_i32 s8, s0, 0x10011
+; GFX8-NEXT: s_bfe_i32 s9, s0, 0x10010
+; GFX8-NEXT: s_bfe_i32 s1, s0, 0x10017
+; GFX8-NEXT: s_bfe_i32 s10, s0, 0x10016
+; GFX8-NEXT: s_bfe_i32 s11, s0, 0x10015
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x10014
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v1, s11
; GFX8-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_mov_b32_e32 v1, s8
; GFX8-NEXT: v_mov_b32_e32 v2, s7
; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_bfe_i32 v5, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v1, v25, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_bfe_i32 v4, v23, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v22, 0, 1
; GFX8-NEXT: v_bfe_i32 v22, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1
; GFX8-NEXT: v_bfe_i32 v20, v8, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v17, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v16, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s0
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, s5
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 1
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NEXT: v_bfe_i32 v3, v27, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_bfe_i32 v2, v26, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -2770,48 +2770,48 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i1_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 11, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v1, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 3, s2
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT: s_bfe_i32 s5, s2, 0x10000
-; GFX12-NEXT: s_bfe_i32 s6, s2, 0x10013
-; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10012
-; GFX12-NEXT: v_lshrrev_b16 v16, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10010
-; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10017
-; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10016
-; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10014
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 11, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v1, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 3, s0
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX12-NEXT: s_bfe_i32 s3, s0, 0x10000
+; GFX12-NEXT: s_bfe_i32 s6, s0, 0x10013
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s1
+; GFX12-NEXT: s_bfe_i32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x10010
+; GFX12-NEXT: s_bfe_i32 s9, s0, 0x10017
+; GFX12-NEXT: s_bfe_i32 s10, s0, 0x10016
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x10014
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s2
+; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v25, s0
; GFX12-NEXT: v_bfe_i32 v15, v14, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v24, s11 :: v_dual_mov_b32 v27, s9
-; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s3
+; GFX12-NEXT: v_dual_mov_b32 v26, s10 :: v_dual_mov_b32 v29, s1
; GFX12-NEXT: v_bfe_i32 v14, v13, 0, 1
; GFX12-NEXT: v_bfe_i32 v13, v12, 0, 1
; GFX12-NEXT: v_bfe_i32 v12, v0, 0, 1
@@ -2828,7 +2828,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX12-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 1
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_bfe_i32 v23, v22, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v21, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v20, 0, 1
@@ -2837,16 +2837,16 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX12-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: v_mov_b32_e32 v16, s4
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: v_mov_b32_e32 v16, s2
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3025,99 +3025,99 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s6, s3, 24
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s7, s3, 1
-; GFX8-NEXT: s_and_b32 s9, s2, 1
-; GFX8-NEXT: s_bfe_u32 s12, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s13, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s14, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s22, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s23, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s10, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s11, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s24, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s25, s3, 0x10014
+; GFX8-NEXT: s_lshr_b32 s6, s1, 24
+; GFX8-NEXT: s_lshr_b32 s8, s0, 24
+; GFX8-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX8-NEXT: s_and_b32 s7, s1, 1
+; GFX8-NEXT: s_and_b32 s9, s0, 1
+; GFX8-NEXT: s_bfe_u32 s12, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s13, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s14, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s15, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s16, s0, 0x10017
+; GFX8-NEXT: s_bfe_u32 s17, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s18, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s19, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s20, s1, 0x10013
+; GFX8-NEXT: s_bfe_u32 s21, s1, 0x10012
+; GFX8-NEXT: s_bfe_u32 s22, s1, 0x10011
+; GFX8-NEXT: s_bfe_u32 s23, s1, 0x10010
+; GFX8-NEXT: s_bfe_u32 s10, s1, 0x10017
+; GFX8-NEXT: s_bfe_u32 s11, s1, 0x10016
+; GFX8-NEXT: s_bfe_u32 s24, s1, 0x10015
+; GFX8-NEXT: s_bfe_u32 s25, s1, 0x10014
; GFX8-NEXT: v_mov_b32_e32 v25, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xd0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v24, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xc0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v22, s25
; GFX8-NEXT: v_mov_b32_e32 v23, s24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NEXT: v_mov_b32_e32 v22, s23
; GFX8-NEXT: v_mov_b32_e32 v23, s22
; GFX8-NEXT: v_mov_b32_e32 v24, s21
; GFX8-NEXT: v_mov_b32_e32 v25, s20
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 64
+; GFX8-NEXT: s_add_u32 s10, s4, 64
; GFX8-NEXT: v_mov_b32_e32 v22, s19
; GFX8-NEXT: v_mov_b32_e32 v23, s18
; GFX8-NEXT: v_mov_b32_e32 v24, s17
; GFX8-NEXT: v_mov_b32_e32 v25, s16
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s14
; GFX8-NEXT: v_mov_b32_e32 v24, s13
; GFX8-NEXT: v_mov_b32_e32 v25, s12
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s1
; GFX8-NEXT: v_mov_b32_e32 v25, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s0
; GFX8-NEXT: v_and_b32_e32 v21, 1, v0
; GFX8-NEXT: v_and_b32_e32 v27, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 1, s1
; GFX8-NEXT: v_mov_b32_e32 v24, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 32
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
+; GFX8-NEXT: s_add_u32 s10, s4, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s0
; GFX8-NEXT: v_and_b32_e32 v28, 1, v22
; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 15, s0
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v21
; GFX8-NEXT: v_and_b32_e32 v20, 1, v19
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
; GFX8-NEXT: v_and_b32_e32 v18, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s0
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 3, s1
; GFX8-NEXT: v_mov_b32_e32 v25, 1
; GFX8-NEXT: v_mov_b32_e32 v21, s11
; GFX8-NEXT: v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s0
; GFX8-NEXT: v_and_b32_e32 v23, 1, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v16
@@ -3129,129 +3129,129 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v20, 1, v14
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s6
-; GFX8-NEXT: s_add_u32 s10, s0, 16
+; GFX8-NEXT: s_add_u32 s10, s4, 16
; GFX8-NEXT: v_and_b32_e32 v17, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v15
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 3, s6
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s0
; GFX8-NEXT: v_and_b32_e32 v19, 1, v15
; GFX8-NEXT: v_mov_b32_e32 v16, s11
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s0
; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX8-NEXT: v_and_b32_e32 v11, 1, v11
; GFX8-NEXT: v_mov_b32_e32 v15, s10
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
; GFX8-NEXT: flat_store_dwordx4 v[15:16], v[11:14]
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s8
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
; GFX8-NEXT: v_and_b32_e32 v15, 1, v11
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v10
; GFX8-NEXT: v_and_b32_e32 v10, 1, v9
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v8
; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
-; GFX8-NEXT: s_add_u32 s10, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xb0
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s8
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s1
; GFX8-NEXT: v_and_b32_e32 v6, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 14, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s1
; GFX8-NEXT: v_and_b32_e32 v11, 1, v8
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 3, s8
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v9, s10
; GFX8-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s1
; GFX8-NEXT: v_and_b32_e32 v13, 1, v8
; GFX8-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 15, s1
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX8-NEXT: v_and_b32_e32 v5, 1, v5
; GFX8-NEXT: v_mov_b32_e32 v10, s11
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 0xa0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 10, s1
+; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 2, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 4, s8
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s8
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[5:8]
-; GFX8-NEXT: s_add_u32 s2, s0, 0xa0
+; GFX8-NEXT: v_and_b32_e32 v10, 1, v16
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v13
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v17
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_and_b32_e32 v4, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 6, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 2, s3
-; GFX8-NEXT: v_and_b32_e32 v10, 1, v16
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 1, v18
; GFX8-NEXT: v_and_b32_e32 v18, 1, v3
; GFX8-NEXT: v_and_b32_sdwa v16, v2, v25 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v15
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff, v19
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v4
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x90
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x90
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff, v1
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v20
; GFX8-NEXT: v_and_b32_e32 v20, 1, v0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 6, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x80
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x80
; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff, v23
; GFX8-NEXT: v_and_b32_e32 v3, 1, v22
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v27
; GFX8-NEXT: v_and_b32_e32 v22, 1, v26
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[20:23]
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v28
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s6
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[1:4]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 7, s6
; GFX8-NEXT: v_and_b32_e32 v16, 1, v24
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s8
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_and_b32_e32 v6, 1, v12
-; GFX8-NEXT: v_mov_b32_e32 v12, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v12, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v11
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 7, s8
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: s_endpgm
@@ -3444,113 +3444,113 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v64i1_to_v64i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v2, 13, s2
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v2, 13, s0
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v3, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v3, 9, s0
; GFX12-NEXT: v_and_b32_e32 v45, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 5, s4
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s14, s3, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v14, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 5, s2
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: s_and_b32 s6, s1, 1
+; GFX12-NEXT: s_bfe_u32 s14, s1, 0x10012
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v55, s14 :: v_dual_and_b32 v36, 1, v10
; GFX12-NEXT: v_and_b32_e32 v10, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 3, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v11, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 3, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v11, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 3, s1
; GFX12-NEXT: v_and_b32_e32 v43, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: s_bfe_u32 s19, s3, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s3
+; GFX12-NEXT: s_bfe_u32 s19, s1, 0x10014
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v49, s19 :: v_dual_and_b32 v42, 1, v3
-; GFX12-NEXT: v_lshrrev_b16 v3, 5, s5
-; GFX12-NEXT: s_bfe_u32 s13, s3, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v29, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v3, 5, s3
+; GFX12-NEXT: s_bfe_u32 s13, s1, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v29, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s1
; GFX12-NEXT: v_dual_mov_b32 v56, s13 :: v_dual_and_b32 v27, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v19, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v19, 2, s1
; GFX12-NEXT: v_and_b32_e32 v12, 1, v2
-; GFX12-NEXT: v_lshrrev_b16 v2, 1, s5
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s15, s3, 0x10011
+; GFX12-NEXT: v_lshrrev_b16 v2, 1, s3
+; GFX12-NEXT: s_and_b32 s7, s0, 1
+; GFX12-NEXT: s_bfe_u32 s15, s1, 0x10011
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v54, s15 :: v_dual_and_b32 v35, 1, v8
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s5
-; GFX12-NEXT: s_bfe_u32 s16, s3, 0x10010
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s3
+; GFX12-NEXT: s_bfe_u32 s16, s1, 0x10010
; GFX12-NEXT: v_dual_mov_b32 v53, s16 :: v_dual_and_b32 v40, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v7, 2, s5
-; GFX12-NEXT: s_bfe_u32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_u32 s18, s3, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v7, 2, s3
+; GFX12-NEXT: s_bfe_u32 s17, s1, 0x10017
+; GFX12-NEXT: s_bfe_u32 s18, s1, 0x10016
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v51, s18 :: v_dual_and_b32 v44, 1, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 4, s5
-; GFX12-NEXT: s_bfe_u32 s13, s2, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v5, 4, s3
+; GFX12-NEXT: s_bfe_u32 s13, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v23, 1, v14
; GFX12-NEXT: v_and_b32_e32 v14, 1, v18
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s5
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
+; GFX12-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x10015
; GFX12-NEXT: v_dual_mov_b32 v52, s17 :: v_dual_and_b32 v39, 1, v6
; GFX12-NEXT: v_and_b32_e32 v32, 1, v11
-; GFX12-NEXT: v_lshrrev_b16 v11, 2, s4
-; GFX12-NEXT: s_bfe_u32 s9, s2, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v11, 2, s2
+; GFX12-NEXT: s_bfe_u32 s9, s0, 0x10012
; GFX12-NEXT: v_and_b32_e32 v20, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s4
-; GFX12-NEXT: s_bfe_u32 s11, s2, 0x10010
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s2
+; GFX12-NEXT: s_bfe_u32 s11, s0, 0x10010
; GFX12-NEXT: v_and_b32_e32 v24, 1, v15
-; GFX12-NEXT: v_lshrrev_b16 v15, 6, s4
-; GFX12-NEXT: s_bfe_u32 s12, s2, 0x10017
-; GFX12-NEXT: v_mov_b32_e32 v50, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v15, 6, s2
+; GFX12-NEXT: s_bfe_u32 s12, s0, 0x10017
+; GFX12-NEXT: v_mov_b32_e32 v50, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v28, 1, v13
-; GFX12-NEXT: v_lshrrev_b16 v13, 4, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX12-NEXT: v_lshrrev_b16 v13, 4, s2
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
; GFX12-NEXT: v_and_b32_e32 v6, 1, v3
; GFX12-NEXT: v_and_b32_e32 v3, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v17, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v1, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v41, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v37, 4, s2
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: s_bfe_u32 s10, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v17, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v1, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v41, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v37, 4, s0
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10013
+; GFX12-NEXT: s_bfe_u32 s10, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10014
; GFX12-NEXT: v_and_b32_e32 v2, 1, v2
; GFX12-NEXT: v_and_b32_e32 v21, 1, v21
; GFX12-NEXT: v_and_b32_e32 v29, 1, v29
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:192
; GFX12-NEXT: v_mov_b32_e32 v52, s12
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v54, s10 :: v_dual_and_b32 v3, 1, v7
; GFX12-NEXT: v_dual_mov_b32 v56, s8 :: v_dual_and_b32 v7, 1, v18
-; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s13
-; GFX12-NEXT: v_mov_b32_e32 v51, s3
+; GFX12-NEXT: v_dual_mov_b32 v49, s0 :: v_dual_mov_b32 v50, s13
+; GFX12-NEXT: v_mov_b32_e32 v51, s1
; GFX12-NEXT: v_dual_mov_b32 v53, s11 :: v_dual_and_b32 v18, 0xffff, v24
; GFX12-NEXT: v_and_b32_e32 v24, 0xffff, v23
; GFX12-NEXT: v_and_b32_e32 v23, 1, v22
@@ -3583,28 +3583,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v33, s7 :: v_dual_and_b32 v14, 0xffff, v14
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_and_b32 v42, 0xffff, v42
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_and_b32 v42, 0xffff, v42
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v8
; GFX12-NEXT: v_and_b32_e32 v5, 1, v5
; GFX12-NEXT: v_and_b32_e32 v37, 1, v37
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v6, 0xffff, v6
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_and_b32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: s_clause 0xd
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1]
-; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:160
-; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[4:5]
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[4:5] offset:160
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3783,84 +3783,84 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s2
-; GFX8-NEXT: s_lshr_b32 s7, s3, 24
-; GFX8-NEXT: s_lshr_b32 s8, s2, 24
-; GFX8-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX8-NEXT: s_bfe_i32 s5, s3, 0x10018
-; GFX8-NEXT: s_bfe_i32 s6, s3, 0x10000
-; GFX8-NEXT: s_bfe_i32 s9, s2, 0x10000
-; GFX8-NEXT: s_bfe_i32 s12, s2, 0x10013
-; GFX8-NEXT: s_bfe_i32 s13, s2, 0x10012
-; GFX8-NEXT: s_bfe_i32 s14, s2, 0x10011
-; GFX8-NEXT: s_bfe_i32 s15, s2, 0x10010
-; GFX8-NEXT: s_bfe_i32 s16, s2, 0x10017
-; GFX8-NEXT: s_bfe_i32 s17, s2, 0x10016
-; GFX8-NEXT: s_bfe_i32 s18, s2, 0x10015
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x10014
-; GFX8-NEXT: s_bfe_i32 s19, s3, 0x10013
-; GFX8-NEXT: s_bfe_i32 s20, s3, 0x10012
-; GFX8-NEXT: s_bfe_i32 s21, s3, 0x10011
-; GFX8-NEXT: s_bfe_i32 s22, s3, 0x10010
-; GFX8-NEXT: s_bfe_i32 s10, s3, 0x10017
-; GFX8-NEXT: s_bfe_i32 s11, s3, 0x10016
-; GFX8-NEXT: s_bfe_i32 s23, s3, 0x10015
-; GFX8-NEXT: s_bfe_i32 s24, s3, 0x10014
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 8, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 11, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 3, s0
+; GFX8-NEXT: s_lshr_b32 s7, s1, 24
+; GFX8-NEXT: s_lshr_b32 s8, s0, 24
+; GFX8-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX8-NEXT: s_bfe_i32 s3, s1, 0x10018
+; GFX8-NEXT: s_bfe_i32 s6, s1, 0x10000
+; GFX8-NEXT: s_bfe_i32 s9, s0, 0x10000
+; GFX8-NEXT: s_bfe_i32 s12, s0, 0x10013
+; GFX8-NEXT: s_bfe_i32 s13, s0, 0x10012
+; GFX8-NEXT: s_bfe_i32 s14, s0, 0x10011
+; GFX8-NEXT: s_bfe_i32 s15, s0, 0x10010
+; GFX8-NEXT: s_bfe_i32 s16, s0, 0x10017
+; GFX8-NEXT: s_bfe_i32 s17, s0, 0x10016
+; GFX8-NEXT: s_bfe_i32 s18, s0, 0x10015
+; GFX8-NEXT: s_bfe_i32 s0, s0, 0x10014
+; GFX8-NEXT: s_bfe_i32 s19, s1, 0x10013
+; GFX8-NEXT: s_bfe_i32 s20, s1, 0x10012
+; GFX8-NEXT: s_bfe_i32 s21, s1, 0x10011
+; GFX8-NEXT: s_bfe_i32 s22, s1, 0x10010
+; GFX8-NEXT: s_bfe_i32 s10, s1, 0x10017
+; GFX8-NEXT: s_bfe_i32 s11, s1, 0x10016
+; GFX8-NEXT: s_bfe_i32 s23, s1, 0x10015
+; GFX8-NEXT: s_bfe_i32 s24, s1, 0x10014
; GFX8-NEXT: v_mov_b32_e32 v25, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xd0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v24, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0xc0
+; GFX8-NEXT: s_add_u32 s10, s4, 0xc0
; GFX8-NEXT: v_mov_b32_e32 v22, s24
; GFX8-NEXT: v_mov_b32_e32 v23, s23
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NEXT: v_mov_b32_e32 v22, s22
; GFX8-NEXT: v_mov_b32_e32 v23, s21
; GFX8-NEXT: v_mov_b32_e32 v24, s20
; GFX8-NEXT: v_mov_b32_e32 v25, s19
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 64
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s10, s4, 64
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: v_mov_b32_e32 v23, s18
; GFX8-NEXT: v_mov_b32_e32 v24, s17
; GFX8-NEXT: v_mov_b32_e32 v25, s16
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s11
; GFX8-NEXT: v_mov_b32_e32 v26, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s14
; GFX8-NEXT: v_mov_b32_e32 v24, s13
; GFX8-NEXT: v_mov_b32_e32 v25, s12
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v23, s11
@@ -3868,58 +3868,58 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v18, v18, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v22, s10
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 13, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 8, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 10, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 11, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v26, 4, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 5, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v28, 6, s1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v23, 1, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 2, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 3, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
; GFX8-NEXT: v_bfe_i32 v13, v13, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v15, s3
+; GFX8-NEXT: v_mov_b32_e32 v15, s1
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v10, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v14, s2
+; GFX8-NEXT: v_mov_b32_e32 v14, s0
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
-; GFX8-NEXT: s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT: v_mov_b32_e32 v12, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT: v_mov_b32_e32 v12, s5
; GFX8-NEXT: v_bfe_i32 v10, v9, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v8, 0, 1
; GFX8-NEXT: v_bfe_i32 v8, v7, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v7, s9
-; GFX8-NEXT: v_mov_b32_e32 v11, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s8
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 5, s8
; GFX8-NEXT: v_bfe_i32 v6, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 1
; GFX8-NEXT: v_bfe_i32 v4, v4, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xa0
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xa0
; GFX8-NEXT: v_lshrrev_b16_e64 v13, 4, s8
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 1, s8
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
@@ -3929,21 +3929,21 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v7, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v11, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v10, v0, 0, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1
; GFX8-NEXT: v_bfe_i32 v4, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v6, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v13, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v12, v2, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x90
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x90
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 5, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v12, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v23, 0, 1
@@ -3951,48 +3951,48 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX8-NEXT: v_bfe_i32 v24, v28, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v27, 0, 1
; GFX8-NEXT: v_bfe_i32 v22, v26, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x80
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x80
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 4, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 1, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[22:25]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v11, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v20, v20, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v18, s6
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 6, s7
; GFX8-NEXT: v_lshrrev_b16_e64 v17, 7, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[18:21]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v17, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v16, v16, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: v_lshrrev_b16_e64 v2, 3, s7
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v13, v2, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
+; GFX8-NEXT: v_mov_b32_e32 v10, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
; GFX8-NEXT: v_lshrrev_b16_e64 v9, 7, s8
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_bfe_i32 v9, v9, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x60
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8-NEXT: s_endpgm
@@ -4244,82 +4244,82 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v64i1_to_v64i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v28, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v29, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v30, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v24, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v27, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 3, s3
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 4, s5
-; GFX12-NEXT: v_lshrrev_b16 v5, 5, s5
-; GFX12-NEXT: v_lshrrev_b16 v6, 6, s5
-; GFX12-NEXT: v_lshrrev_b16 v1, 3, s5
-; GFX12-NEXT: v_lshrrev_b16 v2, 2, s5
-; GFX12-NEXT: v_lshrrev_b16 v7, 1, s5
-; GFX12-NEXT: v_lshrrev_b16 v44, 7, s5
-; GFX12-NEXT: s_bfe_i32 s5, s3, 0x10018
-; GFX12-NEXT: s_bfe_i32 s6, s3, 0x10000
-; GFX12-NEXT: s_bfe_i32 s13, s3, 0x10013
-; GFX12-NEXT: s_bfe_i32 s14, s3, 0x10012
-; GFX12-NEXT: s_bfe_i32 s15, s3, 0x10011
-; GFX12-NEXT: s_bfe_i32 s16, s3, 0x10010
-; GFX12-NEXT: s_bfe_i32 s17, s3, 0x10017
-; GFX12-NEXT: s_bfe_i32 s18, s3, 0x10016
-; GFX12-NEXT: s_bfe_i32 s19, s3, 0x10014
-; GFX12-NEXT: s_bfe_i32 s3, s3, 0x10015
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v28, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v29, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v30, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v24, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v27, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 3, s1
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 4, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v6, 6, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v2, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v7, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v44, 7, s3
+; GFX12-NEXT: s_bfe_i32 s3, s1, 0x10018
+; GFX12-NEXT: s_bfe_i32 s6, s1, 0x10000
+; GFX12-NEXT: s_bfe_i32 s13, s1, 0x10013
+; GFX12-NEXT: s_bfe_i32 s14, s1, 0x10012
+; GFX12-NEXT: s_bfe_i32 s15, s1, 0x10011
+; GFX12-NEXT: s_bfe_i32 s16, s1, 0x10010
+; GFX12-NEXT: s_bfe_i32 s17, s1, 0x10017
+; GFX12-NEXT: s_bfe_i32 s18, s1, 0x10016
+; GFX12-NEXT: s_bfe_i32 s19, s1, 0x10014
+; GFX12-NEXT: s_bfe_i32 s1, s1, 0x10015
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s3
+; GFX12-NEXT: v_dual_mov_b32 v56, 0 :: v_dual_mov_b32 v49, s1
; GFX12-NEXT: v_dual_mov_b32 v48, s19 :: v_dual_mov_b32 v51, s17
; GFX12-NEXT: v_dual_mov_b32 v50, s18 :: v_dual_mov_b32 v53, s15
-; GFX12-NEXT: v_lshrrev_b16 v16, 14, s2
+; GFX12-NEXT: v_lshrrev_b16 v16, 14, s0
; GFX12-NEXT: v_dual_mov_b32 v52, s16 :: v_dual_mov_b32 v55, s13
-; GFX12-NEXT: s_bfe_i32 s13, s2, 0x10015
+; GFX12-NEXT: s_bfe_i32 s13, s0, 0x10015
; GFX12-NEXT: v_mov_b32_e32 v54, s14
-; GFX12-NEXT: v_lshrrev_b16 v0, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v32, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 4, s4
-; GFX12-NEXT: v_lshrrev_b16 v13, 5, s4
-; GFX12-NEXT: v_lshrrev_b16 v14, 6, s4
-; GFX12-NEXT: v_lshrrev_b16 v15, 7, s4
-; GFX12-NEXT: v_lshrrev_b16 v40, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v41, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v42, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v43, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v36, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v37, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v39, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v35, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 1, s4
-; GFX12-NEXT: v_lshrrev_b16 v10, 2, s4
-; GFX12-NEXT: v_lshrrev_b16 v11, 3, s4
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x10018
-; GFX12-NEXT: s_bfe_i32 s7, s2, 0x10000
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x10013
-; GFX12-NEXT: s_bfe_i32 s9, s2, 0x10012
-; GFX12-NEXT: s_bfe_i32 s10, s2, 0x10011
-; GFX12-NEXT: s_bfe_i32 s11, s2, 0x10010
-; GFX12-NEXT: s_bfe_i32 s12, s2, 0x10017
-; GFX12-NEXT: s_bfe_i32 s3, s2, 0x10016
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v0, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v32, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v13, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v15, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v40, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v41, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v42, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v43, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v36, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v37, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v39, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v35, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s2
+; GFX12-NEXT: v_lshrrev_b16 v11, 3, s2
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x10018
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x10000
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x10013
+; GFX12-NEXT: s_bfe_i32 s9, s0, 0x10012
+; GFX12-NEXT: s_bfe_i32 s10, s0, 0x10011
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x10010
+; GFX12-NEXT: s_bfe_i32 s12, s0, 0x10017
+; GFX12-NEXT: s_bfe_i32 s1, s0, 0x10016
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x10014
; GFX12-NEXT: v_bfe_i32 v23, v23, 0, 1
; GFX12-NEXT: v_bfe_i32 v22, v22, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v21, 0, 1
@@ -4329,10 +4329,10 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v29, v29, 0, 1
; GFX12-NEXT: v_bfe_i32 v28, v28, 0, 1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:192
-; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s2
-; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s3
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[4:5] offset:192
+; GFX12-NEXT: v_dual_mov_b32 v49, s13 :: v_dual_mov_b32 v48, s0
+; GFX12-NEXT: v_dual_mov_b32 v51, s12 :: v_dual_mov_b32 v50, s1
; GFX12-NEXT: v_mov_b32_e32 v53, s10
; GFX12-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX12-NEXT: v_bfe_i32 v18, v18, 0, 1
@@ -4362,7 +4362,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v42, v42, 0, 1
; GFX12-NEXT: v_bfe_i32 v41, v41, 0, 1
; GFX12-NEXT: v_bfe_i32 v40, v40, 0, 1
-; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_mov_b32_e32 v8, s3
; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 1
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v4, v4, 0, 1
@@ -4376,22 +4376,22 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v33, v33, 0, 1
; GFX12-NEXT: v_mov_b32_e32 v32, s7
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v56, v[48:51], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v56, v[52:55], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v56, v[44:47], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v56, v[40:43], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v56, v[36:39], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v56, v[32:35], s[0:1]
-; GFX12-NEXT: global_store_b128 v56, v[28:31], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v56, v[24:27], s[0:1] offset:160
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v56, v[48:51], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v56, v[52:55], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v56, v[44:47], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v56, v[40:43], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v56, v[36:39], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v56, v[32:35], s[4:5]
+; GFX12-NEXT: global_store_b128 v56, v[28:31], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v56, v[24:27], s[4:5] offset:160
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v56, v[20:23], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v56, v[16:19], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v56, v[12:15], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v56, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v56, v[4:7], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v56, v[0:3], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v56, v[20:23], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v56, v[16:19], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v56, v[12:15], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v56, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v56, v[4:7], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v56, v[0:3], s[4:5] offset:96
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4423,14 +4423,14 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_zextload_i1_to_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -4455,14 +4455,14 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i1_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4494,13 +4494,13 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: constant_sextload_i1_to_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -4527,14 +4527,14 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i1_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4566,14 +4566,14 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -4598,14 +4598,14 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, s2, 1
+; GFX12-NEXT: s_and_b32 s0, s0, 1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4637,13 +4637,13 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -4670,14 +4670,14 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i1_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4711,14 +4711,14 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v2i1_to_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v3, 1, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -4749,17 +4749,17 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i1_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4794,13 +4794,13 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v2i1_to_v2i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
@@ -4833,10 +4833,10 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i1_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 1
@@ -4845,7 +4845,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4882,17 +4882,17 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v3i1_to_v3i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -4900,10 +4900,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: flat_store_dwordx2 v[6:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff, v8
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -4935,10 +4935,10 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i1_to_v3i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v5, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v5, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v2, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v1, 1, v0
@@ -4950,8 +4950,8 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v3, v5 :: v_dual_and_b32 v4, 0xffff, v3
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v6
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v5, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v5, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -4990,17 +4990,17 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 1, v0
@@ -5045,10 +5045,10 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i1_to_v3i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v6, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v6, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v6, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 2, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 1, v0
@@ -5062,8 +5062,8 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5102,21 +5102,21 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v4i1_to_v4i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 1, v0
@@ -5162,10 +5162,10 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i1_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v6, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -5182,8 +5182,8 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v9
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5225,17 +5225,17 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 2, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 3, v0
@@ -5286,10 +5286,10 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i1_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 3, v0
; GFX12-NEXT: v_lshrrev_b16 v2, 2, v0
@@ -5307,8 +5307,8 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5357,24 +5357,24 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_zextload_v8i1_to_v8i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
@@ -5382,7 +5382,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: v_mov_b32_e32 v13, v1
; GFX8-NEXT: v_mov_b32_e32 v15, v1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 2, v0
@@ -5457,10 +5457,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i1_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v12, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 5, v0
@@ -5482,10 +5482,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v18
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5541,24 +5541,24 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX8-LABEL: constant_sextload_v8i1_to_v8i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v19, s3
-; GFX8-NEXT: v_mov_b32_e32 v18, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v17, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v21, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v19, s1
+; GFX8-NEXT: v_mov_b32_e32 v18, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s1
+; GFX8-NEXT: v_mov_b32_e32 v20, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v17, s5
; GFX8-NEXT: v_mov_b32_e32 v23, s1
-; GFX8-NEXT: v_mov_b32_e32 v20, s2
+; GFX8-NEXT: v_mov_b32_e32 v16, s4
; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 6, v0
@@ -5646,10 +5646,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i1_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v16, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v16, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v3, 6, v1
; GFX12-NEXT: v_lshrrev_b16 v5, 7, v1
@@ -5675,10 +5675,10 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5750,21 +5750,21 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_mov_b32_e32 v6, v2
; GFX8-NEXT: v_mov_b32_e32 v8, v2
; GFX8-NEXT: v_mov_b32_e32 v4, v2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s5
-; GFX8-NEXT: v_mov_b32_e32 v22, s4
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v22, s2
; GFX8-NEXT: v_mov_b32_e32 v9, v2
; GFX8-NEXT: v_mov_b32_e32 v11, v2
; GFX8-NEXT: v_mov_b32_e32 v12, v2
@@ -5781,49 +5781,49 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff, v1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[5:8]
-; GFX8-NEXT: v_mov_b32_e32 v23, s3
+; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
+; GFX8-NEXT: s_add_u32 s0, s4, 64
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 15, v0
; GFX8-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX8-NEXT: s_add_u32 s2, s0, 64
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, v2
; GFX8-NEXT: v_mov_b32_e32 v7, v2
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[1:4]
; GFX8-NEXT: v_mov_b32_e32 v23, v2
; GFX8-NEXT: v_mov_b32_e32 v3, 1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 9, v0
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 12, v0
; GFX8-NEXT: flat_store_dwordx4 v[1:2], v[8:11]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0
+; GFX8-NEXT: v_mov_b32_e32 v9, s1
; GFX8-NEXT: v_and_b32_e32 v11, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 13, v0
-; GFX8-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v6
-; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: flat_store_dwordx4 v[3:4], v[11:14]
-; GFX8-NEXT: s_add_u32 s0, s0, 16
; GFX8-NEXT: v_lshrrev_b16_e32 v3, 7, v0
+; GFX8-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_lshrrev_b16_e32 v6, 6, v0
; GFX8-NEXT: v_lshrrev_b16_e32 v14, 4, v0
-; GFX8-NEXT: v_lshrrev_b16_e32 v16, 5, v0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e32 v4, 2, v0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
; GFX8-NEXT: v_and_b32_e32 v14, 1, v6
@@ -5832,13 +5832,13 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v16
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v3
-; GFX8-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_mov_b32_e32 v12, s0
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v0
@@ -5934,10 +5934,10 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i1_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v28, 1, v0
; GFX12-NEXT: v_lshrrev_b16 v4, 11, v0
@@ -5981,14 +5981,14 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v26, 0xffff, v38
; GFX12-NEXT: v_and_b32_e32 v22, 0xffff, v37
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[28:31], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[24:27], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[28:31], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6073,40 +6073,40 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v16i1_to_v16i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NEXT: v_mov_b32_e32 v5, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x60
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v10, s3
-; GFX8-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v16, s3
-; GFX8-NEXT: v_mov_b32_e32 v15, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v23, s3
-; GFX8-NEXT: v_mov_b32_e32 v22, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NEXT: v_mov_b32_e32 v21, s1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v20, s0
-; GFX8-NEXT: s_add_u32 s0, s0, 16
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v25, s3
+; GFX8-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x60
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 64
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v16, s1
+; GFX8-NEXT: v_mov_b32_e32 v15, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v23, s1
+; GFX8-NEXT: v_mov_b32_e32 v22, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v25, s1
+; GFX8-NEXT: v_mov_b32_e32 v24, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v21, s5
; GFX8-NEXT: v_mov_b32_e32 v27, s1
-; GFX8-NEXT: v_mov_b32_e32 v24, s2
+; GFX8-NEXT: v_mov_b32_e32 v20, s4
; GFX8-NEXT: v_mov_b32_e32 v26, s0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 14, v0
@@ -6267,10 +6267,10 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i1_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v32, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v32, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v32, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v3, 14, v1
; GFX12-NEXT: v_lshrrev_b16 v5, 15, v1
@@ -6320,14 +6320,14 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX12-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6447,86 +6447,86 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s0
; GFX8-NEXT: v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 9, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 7, s0
; GFX8-NEXT: v_and_b32_e32 v11, 1, v2
; GFX8-NEXT: v_and_b32_e32 v2, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT: s_lshr_b32 s14, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s0
+; GFX8-NEXT: s_lshr_b32 s14, s0, 24
; GFX8-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
-; GFX8-NEXT: s_bfe_u32 s10, s2, 0x10018
-; GFX8-NEXT: s_and_b32 s11, s2, 1
-; GFX8-NEXT: s_bfe_u32 s15, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s16, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s17, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s18, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s19, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s21, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s22, s2, 0x10017
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s0
+; GFX8-NEXT: s_bfe_u32 s10, s0, 0x10018
+; GFX8-NEXT: s_and_b32 s11, s0, 1
+; GFX8-NEXT: s_bfe_u32 s15, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s16, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s17, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s18, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s19, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s20, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s21, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s22, s0, 0x10017
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 14, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 4, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v1
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xb0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: s_add_u32 s4, s0, 0xa0
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x90
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x80
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0x70
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 1, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 15, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xb0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: s_add_u32 s2, s4, 0xa0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x90
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: s_add_u32 s8, s4, 0x80
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0x70
; GFX8-NEXT: v_and_b32_e32 v16, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e64 v1, 5, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v1
; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s14
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_and_b32_e32 v25, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0xf0
+; GFX8-NEXT: s_add_u32 s12, s4, 0xf0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v4
; GFX8-NEXT: v_mov_b32_e32 v19, v1
; GFX8-NEXT: v_mov_b32_e32 v21, v1
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s14
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x60
+; GFX8-NEXT: s_add_u32 s12, s4, 0x60
; GFX8-NEXT: v_and_b32_e32 v18, 1, v6
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 7, s14
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 4, s14
; GFX8-NEXT: v_and_b32_e32 v18, 1, v14
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v15
; GFX8-NEXT: v_mov_b32_e32 v15, s13
; GFX8-NEXT: v_mov_b32_e32 v14, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 0x50
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0x50
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v23, s13
; GFX8-NEXT: v_mov_b32_e32 v22, s12
-; GFX8-NEXT: s_add_u32 s12, s0, 64
+; GFX8-NEXT: s_add_u32 s12, s4, 64
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[18:21]
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v9
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v11
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
@@ -6539,17 +6539,17 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[18:21]
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff, v2
; GFX8-NEXT: v_and_b32_e32 v18, 1, v7
-; GFX8-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NEXT: v_mov_b32_e32 v8, s1
; GFX8-NEXT: v_and_b32_e32 v21, 1, v3
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v2, s22
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NEXT: v_mov_b32_e32 v7, s0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NEXT: v_mov_b32_e32 v8, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s19
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NEXT: v_mov_b32_e32 v7, s2
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v8, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s17
@@ -6557,62 +6557,62 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NEXT: s_add_u32 s0, s4, 48
; GFX8-NEXT: v_mov_b32_e32 v0, s16
; GFX8-NEXT: v_mov_b32_e32 v2, s15
; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_and_b32_e32 v15, 1, v24
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 32
; GFX8-NEXT: v_mov_b32_e32 v22, v1
; GFX8-NEXT: v_mov_b32_e32 v24, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 16
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff, v5
; GFX8-NEXT: v_mov_b32_e32 v21, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[18:21]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff, v16
; GFX8-NEXT: v_and_b32_e32 v9, 1, v12
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v12, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v2, v14
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff, v17
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xd0
; GFX8-NEXT: v_mov_b32_e32 v16, v1
; GFX8-NEXT: v_mov_b32_e32 v18, v1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s14
+; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v26, 2, s14
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[15:18]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff, v4
; GFX8-NEXT: v_and_b32_e32 v4, 1, v26
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v25
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -6783,71 +6783,71 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i1_to_v32i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v0, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 11, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v0, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 11, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v2, 12, s2
+; GFX12-NEXT: v_lshrrev_b16 v2, 12, s0
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: v_lshrrev_b16 v4, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v18, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v23, 1, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
+; GFX12-NEXT: v_lshrrev_b16 v4, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v16, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v18, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v11, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 3, s2
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v11, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 3, s0
; GFX12-NEXT: v_and_b32_e32 v24, 1, v4
; GFX12-NEXT: v_and_b32_e32 v25, 1, v8
; GFX12-NEXT: v_and_b32_e32 v28, 1, v21
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v31, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v32, v1 :: v_dual_and_b32 v33, 0xffff, v0
-; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_and_b32 v21, 0xffff, v3
+; GFX12-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_and_b32 v21, 0xffff, v3
; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10015
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v15, 1, s2
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10014
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10015
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v15, 1, s0
; GFX12-NEXT: v_and_b32_e32 v11, 1, v11
; GFX12-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:176
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s6, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v6, 10, s2
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10012
+; GFX12-NEXT: s_bfe_u32 s6, s0, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v6, 10, s0
; GFX12-NEXT: v_and_b32_e32 v26, 1, v15
; GFX12-NEXT: v_dual_mov_b32 v36, v1 :: v_dual_and_b32 v15, 1, v9
; GFX12-NEXT: v_and_b32_e32 v9, 1, v17
; GFX12-NEXT: v_and_b32_e32 v29, 1, v23
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:160
; GFX12-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v14, 2, s2
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s5, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v14, 2, s0
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s3, s0, 1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10010
; GFX12-NEXT: v_dual_mov_b32 v38, v1 :: v_dual_and_b32 v19, 1, v6
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v13
; GFX12-NEXT: v_and_b32_e32 v17, 0xffff, v24
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v26
; GFX12-NEXT: v_and_b32_e32 v4, 1, v14
; GFX12-NEXT: v_and_b32_e32 v8, 1, v12
@@ -6856,9 +6856,9 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v16
; GFX12-NEXT: v_and_b32_e32 v39, 1, v7
; GFX12-NEXT: v_dual_mov_b32 v16, v1 :: v_dual_and_b32 v41, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:128
; GFX12-NEXT: v_mov_b32_e32 v5, v1
-; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s5
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v0, s3
; GFX12-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_and_b32 v29, 0xffff, v9
; GFX12-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_and_b32 v23, 1, v22
; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v27, 1, v20
@@ -6870,26 +6870,26 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v25
; GFX12-NEXT: v_mov_b32_e32 v24, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[35:38], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[39:42], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v1, v[19:22], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v1, v[15:18], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[35:38], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[39:42], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[19:22], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[15:18], s[4:5] offset:64
; GFX12-NEXT: v_mov_b32_e32 v15, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_dual_mov_b32 v2, v44 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v25, 0xffff, v28
; GFX12-NEXT: v_mov_b32_e32 v28, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7067,43 +7067,43 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v32i1_to_v32i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dword s4, s[2:3], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s6, s4, 22
-; GFX8-NEXT: s_lshr_b32 s8, s4, 23
-; GFX8-NEXT: s_lshr_b32 s10, s4, 20
-; GFX8-NEXT: s_lshr_b32 s12, s4, 21
-; GFX8-NEXT: s_lshr_b32 s14, s4, 18
-; GFX8-NEXT: s_lshr_b32 s16, s4, 19
-; GFX8-NEXT: s_lshr_b32 s18, s4, 16
-; GFX8-NEXT: s_lshr_b32 s20, s4, 17
-; GFX8-NEXT: s_lshr_b32 s2, s4, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 15, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 13, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 10, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 11, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 8, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v10, 9, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s4
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v27, 1, s2
+; GFX8-NEXT: s_lshr_b32 s6, s2, 22
+; GFX8-NEXT: s_lshr_b32 s8, s2, 23
+; GFX8-NEXT: s_lshr_b32 s10, s2, 20
+; GFX8-NEXT: s_lshr_b32 s12, s2, 21
+; GFX8-NEXT: s_lshr_b32 s14, s2, 18
+; GFX8-NEXT: s_lshr_b32 s16, s2, 19
+; GFX8-NEXT: s_lshr_b32 s18, s2, 16
+; GFX8-NEXT: s_lshr_b32 s20, s2, 17
+; GFX8-NEXT: s_lshr_b32 s0, s2, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 15, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 12, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 10, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 8, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v10, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v6, 6, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v8, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 2, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 3, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v7, 1, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 7, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 5, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v20, 3, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v27, 1, s0
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX8-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
; GFX8-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
@@ -7113,33 +7113,33 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX8-NEXT: v_mov_b32_e32 v21, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s6, s4, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v22, s7
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0xa0
+; GFX8-NEXT: s_add_u32 s6, s4, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v23, s8
; GFX8-NEXT: v_mov_b32_e32 v24, s9
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x90
+; GFX8-NEXT: s_add_u32 s6, s4, 0x90
; GFX8-NEXT: v_mov_b32_e32 v21, s10
; GFX8-NEXT: v_mov_b32_e32 v22, s11
; GFX8-NEXT: v_mov_b32_e32 v23, s12
; GFX8-NEXT: v_mov_b32_e32 v24, s13
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x80
+; GFX8-NEXT: s_add_u32 s6, s4, 0x80
; GFX8-NEXT: v_mov_b32_e32 v21, s14
; GFX8-NEXT: v_mov_b32_e32 v22, s15
; GFX8-NEXT: v_mov_b32_e32 v23, s16
; GFX8-NEXT: v_mov_b32_e32 v24, s17
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_mov_b32_e32 v26, s7
; GFX8-NEXT: v_mov_b32_e32 v21, s18
@@ -7147,15 +7147,15 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v23, s20
; GFX8-NEXT: v_mov_b32_e32 v24, s21
; GFX8-NEXT: v_mov_b32_e32 v25, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x70
+; GFX8-NEXT: s_add_u32 s6, s4, 0x70
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v23, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v2, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x60
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x60
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_mov_b32_e32 v26, s7
@@ -7163,29 +7163,29 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v25, s6
; GFX8-NEXT: v_bfe_i32 v23, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v21, v15, 0, 1
-; GFX8-NEXT: s_add_u32 s6, s0, 0x50
+; GFX8-NEXT: s_add_u32 s6, s4, 0x50
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[25:26], v[21:24]
; GFX8-NEXT: v_bfe_i32 v25, v14, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v13, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v14, s7
; GFX8-NEXT: v_mov_b32_e32 v13, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 64
+; GFX8-NEXT: s_add_u32 s6, s4, 64
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[23:26]
; GFX8-NEXT: v_bfe_i32 v12, v12, 0, 1
; GFX8-NEXT: v_bfe_i32 v25, v10, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v10, s7
; GFX8-NEXT: v_mov_b32_e32 v9, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 48
+; GFX8-NEXT: s_add_u32 s6, s4, 48
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[9:10], v[23:26]
; GFX8-NEXT: v_bfe_i32 v10, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v25, v8, 0, 1
@@ -7194,18 +7194,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 32
+; GFX8-NEXT: s_add_u32 s6, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[23:26]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v25, v5, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v4, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v4, s6
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 16
+; GFX8-NEXT: s_add_u32 s6, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[23:26]
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_bfe_i32 v25, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v0, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v0, s6
@@ -7214,44 +7214,44 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_bfe_i32 v6, v7, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: s_add_u32 s4, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: s_add_u32 s2, s4, 0xf0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v10
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xe0
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xd0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xe0
; GFX8-NEXT: v_bfe_i32 v17, v18, 0, 1
; GFX8-NEXT: v_bfe_i32 v15, v16, 0, 1
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[10:13]
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v15
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xd0
; GFX8-NEXT: v_bfe_i32 v21, v20, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v19, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NEXT: v_bfe_i32 v2, v27, 0, 1
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_bfe_i32 v2, v27, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -7448,42 +7448,42 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i1_to_v32i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v26, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v28, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v4, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 3, s2
-; GFX12-NEXT: s_lshr_b32 s22, s2, 24
-; GFX12-NEXT: s_lshr_b32 s12, s2, 22
-; GFX12-NEXT: s_lshr_b32 s14, s2, 23
-; GFX12-NEXT: v_lshrrev_b16 v6, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v3, 1, s2
-; GFX12-NEXT: s_lshr_b32 s16, s2, 20
-; GFX12-NEXT: s_lshr_b32 s18, s2, 21
-; GFX12-NEXT: v_lshrrev_b16 v1, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 15, s2
+; GFX12-NEXT: v_lshrrev_b16 v26, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v28, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v4, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 3, s0
+; GFX12-NEXT: s_lshr_b32 s22, s0, 24
+; GFX12-NEXT: s_lshr_b32 s12, s0, 22
+; GFX12-NEXT: s_lshr_b32 s14, s0, 23
+; GFX12-NEXT: v_lshrrev_b16 v6, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v3, 1, s0
+; GFX12-NEXT: s_lshr_b32 s16, s0, 20
+; GFX12-NEXT: s_lshr_b32 s18, s0, 21
+; GFX12-NEXT: v_lshrrev_b16 v1, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 15, s0
; GFX12-NEXT: v_lshrrev_b16 v12, 6, s22
; GFX12-NEXT: v_lshrrev_b16 v14, 7, s22
-; GFX12-NEXT: v_lshrrev_b16 v9, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 13, s0
; GFX12-NEXT: v_lshrrev_b16 v16, 4, s22
; GFX12-NEXT: v_lshrrev_b16 v17, 5, s22
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT: s_lshr_b32 s4, s2, 18
-; GFX12-NEXT: v_lshrrev_b16 v37, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
+; GFX12-NEXT: s_lshr_b32 s2, s0, 18
+; GFX12-NEXT: v_lshrrev_b16 v37, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s0
; GFX12-NEXT: v_lshrrev_b16 v13, 2, s22
; GFX12-NEXT: v_lshrrev_b16 v15, 3, s22
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v49, s12
-; GFX12-NEXT: v_lshrrev_b16 v30, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v32, 9, s2
+; GFX12-NEXT: v_lshrrev_b16 v30, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v32, 9, s0
; GFX12-NEXT: v_lshrrev_b16 v11, 1, s22
; GFX12-NEXT: v_bfe_i32 v7, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v5, v4, 0, 1
@@ -7491,15 +7491,15 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v29, v26, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v50, s13 :: v_dual_mov_b32 v51, s14
; GFX12-NEXT: v_dual_mov_b32 v52, s15 :: v_dual_mov_b32 v53, s16
-; GFX12-NEXT: s_lshr_b32 s6, s2, 19
+; GFX12-NEXT: s_lshr_b32 s6, s0, 19
; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX12-NEXT: v_bfe_i32 v27, v8, 0, 1
; GFX12-NEXT: v_bfe_i32 v25, v6, 0, 1
; GFX12-NEXT: v_dual_mov_b32 v54, s17 :: v_dual_mov_b32 v55, s18
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v56, s19
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
-; GFX12-NEXT: s_lshr_b32 s20, s2, 17
+; GFX12-NEXT: s_lshr_b32 s10, s0, 16
+; GFX12-NEXT: s_lshr_b32 s20, s0, 17
; GFX12-NEXT: v_bfe_i32 v23, v14, 0, 1
; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 1
; GFX12-NEXT: v_bfe_i32 v47, v2, 0, 1
@@ -7509,7 +7509,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v43, v10, 0, 1
; GFX12-NEXT: v_bfe_i32 v41, v9, 0, 1
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[8:9], s[0:1], 0x10000
; GFX12-NEXT: v_bfe_i32 v15, v15, 0, 1
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 1
; GFX12-NEXT: v_bfe_i32 v39, v34, 0, 1
@@ -7524,18 +7524,18 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v33, v30, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v30, 31, v29
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:160
-; GFX12-NEXT: v_dual_mov_b32 v49, s4 :: v_dual_mov_b32 v50, s5
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:160
+; GFX12-NEXT: v_dual_mov_b32 v49, s2 :: v_dual_mov_b32 v50, s3
; GFX12-NEXT: v_dual_mov_b32 v51, s6 :: v_dual_mov_b32 v52, s7
; GFX12-NEXT: v_mov_b32_e32 v53, s10
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[22:23], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[22:23], 0x10000
; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX12-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX12-NEXT: v_dual_mov_b32 v54, s11 :: v_dual_mov_b32 v55, s20
; GFX12-NEXT: v_dual_mov_b32 v56, s21 :: v_dual_mov_b32 v1, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s9 :: v_dual_mov_b32 v9, s0
; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23
; GFX12-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX12-NEXT: v_ashrrev_i32_e32 v48, 31, v47
@@ -7552,22 +7552,22 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v36, 31, v35
; GFX12-NEXT: v_ashrrev_i32_e32 v34, 31, v33
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v0, v[49:52], s[0:1] offset:144
-; GFX12-NEXT: global_store_b128 v0, v[53:56], s[0:1] offset:128
-; GFX12-NEXT: global_store_b128 v0, v[45:48], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v0, v[41:44], s[0:1] offset:96
-; GFX12-NEXT: global_store_b128 v0, v[37:40], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v0, v[33:36], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v0, v[29:32], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v0, v[25:28], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v10, s3
+; GFX12-NEXT: global_store_b128 v0, v[49:52], s[4:5] offset:144
+; GFX12-NEXT: global_store_b128 v0, v[53:56], s[4:5] offset:128
+; GFX12-NEXT: global_store_b128 v0, v[45:48], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v0, v[41:44], s[4:5] offset:96
+; GFX12-NEXT: global_store_b128 v0, v[37:40], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v0, v[33:36], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v0, v[29:32], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v0, v[25:28], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v10, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v0, v[5:8], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v0, v[1:4], s[0:1]
-; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v0, v[13:16], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v0, v[9:12], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v0, v[5:8], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GFX12-NEXT: global_store_b128 v0, v[21:24], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v0, v[17:20], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v0, v[13:16], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v0, v[9:12], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7783,159 +7783,159 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 13, s0
; GFX8-NEXT: v_and_b32_e32 v18, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 11, s0
; GFX8-NEXT: v_and_b32_e32 v16, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 9, s0
; GFX8-NEXT: v_and_b32_e32 v15, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 7, s0
; GFX8-NEXT: v_and_b32_e32 v13, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 5, s0
; GFX8-NEXT: v_and_b32_e32 v10, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s2
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s2
-; GFX8-NEXT: s_lshr_b32 s33, s3, 24
-; GFX8-NEXT: s_lshr_b32 s24, s2, 24
-; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s2
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 3, s0
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 14, s0
+; GFX8-NEXT: s_lshr_b32 s33, s1, 24
+; GFX8-NEXT: s_lshr_b32 s24, s0, 24
+; GFX8-NEXT: v_lshrrev_b16_e64 v19, 12, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v17, 10, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v11, 4, s0
; GFX8-NEXT: v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s2
-; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s2
-; GFX8-NEXT: s_bfe_u32 s20, s2, 0x10018
-; GFX8-NEXT: s_bfe_u32 s21, s3, 0x10018
-; GFX8-NEXT: s_and_b32 s22, s3, 1
-; GFX8-NEXT: s_and_b32 s23, s2, 1
-; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s2
-; GFX8-NEXT: s_bfe_u32 s25, s2, 0x10011
-; GFX8-NEXT: s_bfe_u32 s26, s2, 0x10010
-; GFX8-NEXT: s_bfe_u32 s27, s2, 0x10012
-; GFX8-NEXT: s_bfe_u32 s28, s2, 0x10013
-; GFX8-NEXT: s_bfe_u32 s29, s2, 0x10014
-; GFX8-NEXT: s_bfe_u32 s30, s2, 0x10015
-; GFX8-NEXT: s_bfe_u32 s31, s2, 0x10016
-; GFX8-NEXT: s_bfe_u32 s2, s2, 0x10017
-; GFX8-NEXT: s_bfe_u32 s34, s3, 0x10011
-; GFX8-NEXT: s_bfe_u32 s35, s3, 0x10010
-; GFX8-NEXT: s_bfe_u32 s36, s3, 0x10012
-; GFX8-NEXT: s_bfe_u32 s37, s3, 0x10013
-; GFX8-NEXT: s_bfe_u32 s38, s3, 0x10016
-; GFX8-NEXT: s_bfe_u32 s39, s3, 0x10017
-; GFX8-NEXT: s_bfe_u32 s40, s3, 0x10015
-; GFX8-NEXT: s_bfe_u32 s41, s3, 0x10014
-; GFX8-NEXT: s_add_u32 s4, s0, 0x1a0
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1b0
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
-; GFX8-NEXT: s_add_u32 s8, s0, 0x190
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
-; GFX8-NEXT: s_add_u32 s10, s0, 0x180
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
-; GFX8-NEXT: s_add_u32 s12, s0, 0xb0
-; GFX8-NEXT: s_addc_u32 s13, s1, 0
-; GFX8-NEXT: s_add_u32 s14, s0, 0xa0
-; GFX8-NEXT: s_addc_u32 s15, s1, 0
-; GFX8-NEXT: s_add_u32 s16, s0, 0x90
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
-; GFX8-NEXT: s_add_u32 s18, s0, 0x80
-; GFX8-NEXT: s_addc_u32 s19, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x70
+; GFX8-NEXT: v_lshrrev_b16_e64 v9, 2, s0
+; GFX8-NEXT: v_lshrrev_b16_e64 v0, 1, s0
+; GFX8-NEXT: s_bfe_u32 s20, s0, 0x10018
+; GFX8-NEXT: s_bfe_u32 s21, s1, 0x10018
+; GFX8-NEXT: s_and_b32 s22, s1, 1
+; GFX8-NEXT: s_and_b32 s23, s0, 1
+; GFX8-NEXT: v_lshrrev_b16_e64 v4, 15, s0
+; GFX8-NEXT: s_bfe_u32 s25, s0, 0x10011
+; GFX8-NEXT: s_bfe_u32 s26, s0, 0x10010
+; GFX8-NEXT: s_bfe_u32 s27, s0, 0x10012
+; GFX8-NEXT: s_bfe_u32 s28, s0, 0x10013
+; GFX8-NEXT: s_bfe_u32 s29, s0, 0x10014
+; GFX8-NEXT: s_bfe_u32 s30, s0, 0x10015
+; GFX8-NEXT: s_bfe_u32 s31, s0, 0x10016
+; GFX8-NEXT: s_bfe_u32 s0, s0, 0x10017
+; GFX8-NEXT: s_bfe_u32 s34, s1, 0x10011
+; GFX8-NEXT: s_bfe_u32 s35, s1, 0x10010
+; GFX8-NEXT: s_bfe_u32 s36, s1, 0x10012
+; GFX8-NEXT: s_bfe_u32 s37, s1, 0x10013
+; GFX8-NEXT: s_bfe_u32 s38, s1, 0x10016
+; GFX8-NEXT: s_bfe_u32 s39, s1, 0x10017
+; GFX8-NEXT: s_bfe_u32 s40, s1, 0x10015
+; GFX8-NEXT: s_bfe_u32 s41, s1, 0x10014
+; GFX8-NEXT: s_add_u32 s2, s4, 0x1a0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1b0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: s_add_u32 s8, s4, 0x190
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
+; GFX8-NEXT: s_add_u32 s10, s4, 0x180
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
+; GFX8-NEXT: s_add_u32 s12, s4, 0xb0
+; GFX8-NEXT: s_addc_u32 s13, s5, 0
+; GFX8-NEXT: s_add_u32 s14, s4, 0xa0
+; GFX8-NEXT: s_addc_u32 s15, s5, 0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x90
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
+; GFX8-NEXT: s_add_u32 s18, s4, 0x80
+; GFX8-NEXT: s_addc_u32 s19, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v1, 13, s1
+; GFX8-NEXT: s_add_u32 s42, s4, 0x70
; GFX8-NEXT: v_and_b32_e32 v7, 1, v1
; GFX8-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v23, s42
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
; GFX8-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v24, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x170
-; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x170
+; GFX8-NEXT: v_lshrrev_b16_e64 v22, 14, s1
; GFX8-NEXT: flat_store_dwordx4 v[23:24], v[2:5]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v22, 1, v22
-; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v24, 15, s1
; GFX8-NEXT: v_mov_b32_e32 v23, v1
; GFX8-NEXT: v_mov_b32_e32 v25, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s3
-; GFX8-NEXT: s_add_u32 s42, s0, 0x1f0
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 11, s1
+; GFX8-NEXT: s_add_u32 s42, s4, 0x1f0
; GFX8-NEXT: v_lshrrev_b16_e64 v21, 6, s33
; GFX8-NEXT: v_and_b32_e32 v4, 1, v2
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v21, 1, v21
; GFX8-NEXT: v_lshrrev_b16_e64 v23, 7, s33
; GFX8-NEXT: v_mov_b32_e32 v22, v1
; GFX8-NEXT: v_mov_b32_e32 v24, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0xf0
+; GFX8-NEXT: s_add_u32 s42, s4, 0xf0
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 6, s24
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[21:24]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v22, 1, v20
; GFX8-NEXT: v_lshrrev_b16_e64 v24, 7, s24
; GFX8-NEXT: v_mov_b32_e32 v23, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x60
+; GFX8-NEXT: s_add_u32 s42, s4, 0x60
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v19
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v18
; GFX8-NEXT: v_mov_b32_e32 v18, s42
; GFX8-NEXT: v_mov_b32_e32 v19, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x50
+; GFX8-NEXT: s_add_u32 s42, s4, 0x50
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v17
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v16
; GFX8-NEXT: v_mov_b32_e32 v16, s42
; GFX8-NEXT: v_mov_b32_e32 v17, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 64
+; GFX8-NEXT: s_add_u32 s42, s4, 64
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v17, 1
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v26, s42
; GFX8-NEXT: v_and_b32_sdwa v22, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v15
; GFX8-NEXT: v_mov_b32_e32 v27, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 48
+; GFX8-NEXT: s_add_u32 s42, s4, 48
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v14
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v13
; GFX8-NEXT: v_mov_b32_e32 v13, s42
; GFX8-NEXT: v_mov_b32_e32 v14, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 32
+; GFX8-NEXT: s_add_u32 s42, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[13:14], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v11
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v10
; GFX8-NEXT: v_mov_b32_e32 v10, s42
; GFX8-NEXT: v_mov_b32_e32 v11, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 16
+; GFX8-NEXT: s_add_u32 s42, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[22:25]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v22, 1, v9
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff, v8
; GFX8-NEXT: v_mov_b32_e32 v8, s42
; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x160
-; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x160
+; GFX8-NEXT: v_lshrrev_b16_e64 v5, 12, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 3, s33
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[22:25]
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s33
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v22, s42
; GFX8-NEXT: v_and_b32_e32 v28, 1, v10
; GFX8-NEXT: v_and_b32_e32 v19, 1, v8
@@ -7945,12 +7945,12 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v11, v1
; GFX8-NEXT: v_mov_b32_e32 v23, s43
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 5, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x150
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x150
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 10, s1
; GFX8-NEXT: flat_store_dwordx4 v[22:23], v[8:11]
; GFX8-NEXT: v_and_b32_e32 v22, 1, v5
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s42
; GFX8-NEXT: v_and_b32_e32 v7, 1, v21
; GFX8-NEXT: v_mov_b32_e32 v8, v1
@@ -7958,28 +7958,28 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v5, s43
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[7:10]
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 3, s24
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 9, s1
; GFX8-NEXT: v_and_b32_e32 v10, 1, v4
; GFX8-NEXT: v_lshrrev_b16_e64 v4, 1, s24
-; GFX8-NEXT: s_add_u32 s42, s0, 0x140
-; GFX8-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x140
+; GFX8-NEXT: v_mov_b32_e32 v6, s1
; GFX8-NEXT: v_and_b32_e32 v20, 1, v2
; GFX8-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v8, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v2, 7, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v18, 6, s1
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff, v4
; GFX8-NEXT: v_and_b32_sdwa v4, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff, v20
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
; GFX8-NEXT: v_mov_b32_e32 v9, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x130
+; GFX8-NEXT: s_add_u32 s42, s4, 0x130
; GFX8-NEXT: v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v3, 5, s1
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_and_b32_e32 v7, 1, v18
; GFX8-NEXT: v_mov_b32_e32 v17, s42
; GFX8-NEXT: v_and_b32_e32 v3, 1, v3
@@ -7988,25 +7988,25 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v10, v1
; GFX8-NEXT: v_mov_b32_e32 v18, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x120
-; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s3
+; GFX8-NEXT: s_add_u32 s42, s4, 0x120
+; GFX8-NEXT: v_lshrrev_b16_e64 v16, 4, s1
; GFX8-NEXT: flat_store_dwordx4 v[17:18], v[7:10]
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff, v3
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
-; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v12, 3, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v13, 1, s1
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff, v19
; GFX8-NEXT: v_and_b32_e32 v16, 1, v16
; GFX8-NEXT: v_mov_b32_e32 v17, v1
; GFX8-NEXT: v_mov_b32_e32 v19, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
-; GFX8-NEXT: s_add_u32 s42, s0, 0x110
+; GFX8-NEXT: s_add_u32 s42, s4, 0x110
; GFX8-NEXT: v_and_b32_e32 v12, 1, v12
-; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s3
+; GFX8-NEXT: v_lshrrev_b16_e64 v15, 2, s1
; GFX8-NEXT: v_and_b32_e32 v13, 1, v13
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[16:19]
-; GFX8-NEXT: s_addc_u32 s43, s1, 0
+; GFX8-NEXT: s_addc_u32 s43, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s42
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v15
@@ -8015,13 +8015,13 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v20, v1
; GFX8-NEXT: v_mov_b32_e32 v3, s43
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff, v13
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s41
; GFX8-NEXT: v_mov_b32_e32 v2, s40
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v13, s7
; GFX8-NEXT: v_mov_b32_e32 v0, s38
@@ -8040,7 +8040,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v12, s12
; GFX8-NEXT: v_mov_b32_e32 v0, s31
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v13, s13
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_mov_b32_e32 v12, s14
@@ -8058,66 +8058,66 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_mov_b32_e32 v2, s25
; GFX8-NEXT: v_mov_b32_e32 v13, s19
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s1
-; GFX8-NEXT: s_add_u32 s2, s0, 0x100
+; GFX8-NEXT: v_mov_b32_e32 v13, s5
+; GFX8-NEXT: s_add_u32 s0, s4, 0x100
; GFX8-NEXT: v_mov_b32_e32 v0, s23
; GFX8-NEXT: v_mov_b32_e32 v2, v10
-; GFX8-NEXT: v_mov_b32_e32 v12, s0
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v12, s4
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
-; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1e0
; GFX8-NEXT: v_mov_b32_e32 v0, s22
; GFX8-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1e0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NEXT: v_lshrrev_b16_e64 v14, 5, s33
+; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GFX8-NEXT: v_and_b32_e32 v26, 1, v14
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_lshrrev_b16_e64 v27, 4, s33
-; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1d0
; GFX8-NEXT: v_and_b32_e32 v17, 1, v27
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff, v26
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1d0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[17:20]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0x1c0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0x1c0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 2, s33
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v13, s1
; GFX8-NEXT: v_and_b32_e32 v14, 1, v14
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff, v28
; GFX8-NEXT: v_mov_b32_e32 v15, v1
; GFX8-NEXT: v_mov_b32_e32 v17, v1
-; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: v_mov_b32_e32 v12, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[14:17]
; GFX8-NEXT: v_mov_b32_e32 v0, s21
; GFX8-NEXT: v_mov_b32_e32 v2, v5
; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v12, s2
-; GFX8-NEXT: s_add_u32 s2, s0, 0xe0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_lshrrev_b16_e64 v23, 4, s24
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_add_u32 s2, s0, 0xd0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xd0
; GFX8-NEXT: v_and_b32_e32 v7, 1, v23
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff, v22
; GFX8-NEXT: v_mov_b32_e32 v8, v1
; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[7:10]
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_lshrrev_b16_e64 v21, 2, s24
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
; GFX8-NEXT: v_and_b32_e32 v4, 1, v21
; GFX8-NEXT: v_mov_b32_e32 v5, v1
; GFX8-NEXT: v_mov_b32_e32 v7, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GFX8-NEXT: v_mov_b32_e32 v0, s20
; GFX8-NEXT: v_mov_b32_e32 v5, s1
@@ -8434,58 +8434,58 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v64i1_to_v64i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v4, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v6, 7, s2
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v6, 7, s0
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
-; GFX12-NEXT: v_lshrrev_b16 v14, 13, s3
+; GFX12-NEXT: v_lshrrev_b16 v14, 13, s1
; GFX12-NEXT: v_and_b32_e32 v34, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v18, 9, s3
+; GFX12-NEXT: v_lshrrev_b16 v18, 9, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX12-NEXT: v_dual_mov_b32 v28, v1 :: v_dual_and_b32 v41, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 5, s4
-; GFX12-NEXT: v_lshrrev_b16 v6, 3, s4
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: v_lshrrev_b16 v3, 13, s2
-; GFX12-NEXT: v_lshrrev_b16 v8, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v10, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v23, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v24, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v25, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v4, 5, s2
+; GFX12-NEXT: v_lshrrev_b16 v6, 3, s2
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: v_lshrrev_b16 v3, 13, s0
+; GFX12-NEXT: v_lshrrev_b16 v8, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v10, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v23, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v24, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s1
; GFX12-NEXT: v_and_b32_e32 v50, 1, v14
; GFX12-NEXT: v_and_b32_e32 v47, 1, v18
; GFX12-NEXT: v_and_b32_e32 v18, 1, v4
; GFX12-NEXT: v_and_b32_e32 v14, 1, v6
-; GFX12-NEXT: v_lshrrev_b16 v4, 3, s5
-; GFX12-NEXT: v_lshrrev_b16 v6, 5, s5
-; GFX12-NEXT: v_lshrrev_b16 v0, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v2, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v12, 1, s2
+; GFX12-NEXT: v_lshrrev_b16 v4, 3, s3
+; GFX12-NEXT: v_lshrrev_b16 v6, 5, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v12, 1, s0
; GFX12-NEXT: v_and_b32_e32 v3, 1, v3
; GFX12-NEXT: v_and_b32_e32 v42, 1, v8
; GFX12-NEXT: v_and_b32_e32 v52, 1, v10
; GFX12-NEXT: v_and_b32_e32 v40, 1, v23
; GFX12-NEXT: v_dual_mov_b32 v44, v1 :: v_dual_and_b32 v43, 1, v24
-; GFX12-NEXT: v_lshrrev_b16 v8, 1, s5
-; GFX12-NEXT: v_lshrrev_b16 v10, 2, s5
-; GFX12-NEXT: v_lshrrev_b16 v24, 4, s5
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10014
+; GFX12-NEXT: v_lshrrev_b16 v8, 1, s3
+; GFX12-NEXT: v_lshrrev_b16 v10, 2, s3
+; GFX12-NEXT: v_lshrrev_b16 v24, 4, s3
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x10014
; GFX12-NEXT: v_and_b32_e32 v33, 1, v25
; GFX12-NEXT: v_and_b32_e32 v25, 1, v6
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10015
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10015
; GFX12-NEXT: v_and_b32_e32 v23, 1, v4
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v16, 11, s3
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v16, 11, s1
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v35, 1, v5
; GFX12-NEXT: v_dual_mov_b32 v30, v1 :: v_dual_and_b32 v5, 1, v12
-; GFX12-NEXT: v_lshrrev_b16 v36, 7, s5
-; GFX12-NEXT: v_lshrrev_b16 v37, 6, s5
+; GFX12-NEXT: v_lshrrev_b16 v36, 7, s3
+; GFX12-NEXT: v_lshrrev_b16 v37, 6, s3
; GFX12-NEXT: v_and_b32_e32 v56, 1, v8
; GFX12-NEXT: v_and_b32_e32 v4, 1, v10
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v23
@@ -8494,16 +8494,16 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v23, 1, v2
; GFX12-NEXT: v_dual_mov_b32 v24, v1 :: v_dual_and_b32 v25, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v3
-; GFX12-NEXT: s_bfe_u32 s9, s3, 0x10013
+; GFX12-NEXT: s_bfe_u32 s9, s1, 0x10013
; GFX12-NEXT: v_and_b32_e32 v27, 1, v7
-; GFX12-NEXT: v_lshrrev_b16 v9, 10, s2
-; GFX12-NEXT: v_lshrrev_b16 v13, 6, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 10, s0
+; GFX12-NEXT: v_lshrrev_b16 v13, 6, s0
; GFX12-NEXT: v_and_b32_e32 v22, 1, v16
-; GFX12-NEXT: v_lshrrev_b16 v54, 1, s3
-; GFX12-NEXT: v_lshrrev_b16 v55, 1, s4
+; GFX12-NEXT: v_lshrrev_b16 v54, 1, s1
+; GFX12-NEXT: v_lshrrev_b16 v55, 1, s2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:96
; GFX12-NEXT: v_and_b32_e32 v23, 1, v37
; GFX12-NEXT: v_and_b32_e32 v25, 0xffff, v36
; GFX12-NEXT: v_dual_mov_b32 v57, v1 :: v_dual_and_b32 v28, 0xffff, v34
@@ -8512,91 +8512,91 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v37, v1 :: v_dual_and_b32 v26, 1, v9
; GFX12-NEXT: v_mov_b32_e32 v27, v1
; GFX12-NEXT: v_dual_mov_b32 v29, v1 :: v_dual_and_b32 v0, 1, v55
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[4:5] offset:64
; GFX12-NEXT: v_and_b32_e32 v34, 1, v13
; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v41
; GFX12-NEXT: v_and_b32_e32 v2, 1, v54
-; GFX12-NEXT: global_store_b128 v1, v[26:29], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v1, v[26:29], s[4:5] offset:80
; GFX12-NEXT: v_and_b32_e32 v30, 0xffff, v0
; GFX12-NEXT: v_mov_b32_e32 v0, s7
-; GFX12-NEXT: global_store_b128 v1, v[34:37], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v1, v[34:37], s[4:5] offset:48
; GFX12-NEXT: v_and_b32_e32 v36, 0xffff, v2
; GFX12-NEXT: v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s7, s3, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 14, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 14, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:416
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10012
-; GFX12-NEXT: v_lshrrev_b16 v19, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v32, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v38, 6, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10012
+; GFX12-NEXT: v_lshrrev_b16 v19, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v32, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v38, 6, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:432
; GFX12-NEXT: v_mov_b32_e32 v0, s8
; GFX12-NEXT: v_mov_b32_e32 v2, s9
-; GFX12-NEXT: v_lshrrev_b16 v39, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v31, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v28, 10, s3
-; GFX12-NEXT: s_bfe_u32 s5, s3, 0x10018
-; GFX12-NEXT: s_and_b32 s6, s3, 1
-; GFX12-NEXT: s_bfe_u32 s8, s3, 0x10011
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x10010
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v39, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v31, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v28, 10, s1
+; GFX12-NEXT: s_bfe_u32 s3, s1, 0x10018
+; GFX12-NEXT: s_and_b32 s6, s1, 1
+; GFX12-NEXT: s_bfe_u32 s8, s1, 0x10011
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x10010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:400
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10016
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10017
-; GFX12-NEXT: v_lshrrev_b16 v15, 4, s2
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10016
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10017
+; GFX12-NEXT: v_lshrrev_b16 v15, 4, s0
; GFX12-NEXT: v_and_b32_e32 v31, 1, v31
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:384
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10014
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10015
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10014
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10015
; GFX12-NEXT: v_and_b32_e32 v29, 0xffff, v43
; GFX12-NEXT: v_and_b32_e32 v41, 1, v15
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:176
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10012
-; GFX12-NEXT: s_bfe_u32 s8, s2, 0x10013
-; GFX12-NEXT: v_lshrrev_b16 v17, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v46, 7, s4
-; GFX12-NEXT: v_lshrrev_b16 v49, 6, s4
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10012
+; GFX12-NEXT: s_bfe_u32 s8, s0, 0x10013
+; GFX12-NEXT: v_lshrrev_b16 v17, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v46, 7, s2
+; GFX12-NEXT: v_lshrrev_b16 v49, 6, s2
; GFX12-NEXT: v_dual_mov_b32 v26, v1 :: v_dual_and_b32 v43, 0xffff, v42
; GFX12-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_and_b32 v45, 1, v32
; GFX12-NEXT: v_and_b32_e32 v47, 0xffff, v47
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:160
; GFX12-NEXT: v_mov_b32_e32 v2, s8
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 4, s4
-; GFX12-NEXT: v_lshrrev_b16 v12, 2, s4
-; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10018
-; GFX12-NEXT: s_and_b32 s7, s2, 1
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10011
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x10010
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 4, s2
+; GFX12-NEXT: v_lshrrev_b16 v12, 2, s2
+; GFX12-NEXT: s_bfe_u32 s2, s0, 0x10018
+; GFX12-NEXT: s_and_b32 s7, s0, 1
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10011
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x10010
; GFX12-NEXT: v_and_b32_e32 v51, 1, v17
; GFX12-NEXT: v_dual_mov_b32 v54, v1 :: v_dual_and_b32 v53, 0xffff, v52
; GFX12-NEXT: v_and_b32_e32 v37, 0xffff, v5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:144
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: v_mov_b32_e32 v52, v1
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[4:5] offset:32
; GFX12-NEXT: v_and_b32_e32 v41, 1, v49
; GFX12-NEXT: v_and_b32_e32 v43, 0xffff, v46
; GFX12-NEXT: v_mov_b32_e32 v13, v1
; GFX12-NEXT: v_and_b32_e32 v35, 0xffff, v56
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:128
; GFX12-NEXT: v_mov_b32_e32 v0, s7
; GFX12-NEXT: v_mov_b32_e32 v46, v1
; GFX12-NEXT: v_mov_b32_e32 v2, v37
; GFX12-NEXT: v_dual_mov_b32 v55, v1 :: v_dual_and_b32 v16, 1, v16
; GFX12-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_and_b32 v48, 1, v19
; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v14
-; GFX12-NEXT: global_store_b128 v1, v[51:54], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v1, v[51:54], s[4:5] offset:16
; GFX12-NEXT: v_dual_mov_b32 v53, v1 :: v_dual_and_b32 v52, 1, v21
; GFX12-NEXT: v_and_b32_e32 v54, 0xffff, v20
; GFX12-NEXT: v_dual_mov_b32 v17, v1 :: v_dual_and_b32 v50, 0xffff, v50
@@ -8608,40 +8608,40 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v56, 1, v28
; GFX12-NEXT: v_and_b32_e32 v58, 0xffff, v22
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v1, v[41:44], s[0:1] offset:496
-; GFX12-NEXT: global_store_b128 v1, v[52:55], s[0:1] offset:368
-; GFX12-NEXT: global_store_b128 v1, v[48:51], s[0:1] offset:352
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v1, v[41:44], s[4:5] offset:496
+; GFX12-NEXT: global_store_b128 v1, v[52:55], s[4:5] offset:368
+; GFX12-NEXT: global_store_b128 v1, v[48:51], s[4:5] offset:352
; GFX12-NEXT: v_mov_b32_e32 v41, v1
; GFX12-NEXT: v_dual_mov_b32 v39, v1 :: v_dual_mov_b32 v0, s6
; GFX12-NEXT: v_mov_b32_e32 v2, v36
; GFX12-NEXT: v_dual_mov_b32 v48, v1 :: v_dual_and_b32 v33, 0xffff, v33
; GFX12-NEXT: v_mov_b32_e32 v32, v1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[56:59], s[0:1] offset:336
-; GFX12-NEXT: global_store_b128 v1, v[45:48], s[0:1] offset:320
-; GFX12-NEXT: global_store_b128 v1, v[38:41], s[0:1] offset:304
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:256
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: global_store_b128 v1, v[56:59], s[4:5] offset:336
+; GFX12-NEXT: global_store_b128 v1, v[45:48], s[4:5] offset:320
+; GFX12-NEXT: global_store_b128 v1, v[38:41], s[4:5] offset:304
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:256
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: v_mov_b32_e32 v2, v30
; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_and_b32 v12, 1, v12
; GFX12-NEXT: v_mov_b32_e32 v15, v1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v1, v[31:34], s[0:1] offset:272
-; GFX12-NEXT: global_store_b128 v1, v[23:26], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:480
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: global_store_b128 v1, v[31:34], s[4:5] offset:272
+; GFX12-NEXT: global_store_b128 v1, v[23:26], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v1, v[16:19], s[4:5] offset:480
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:448
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_dual_mov_b32 v2, v35 :: v_dual_mov_b32 v9, v1
; GFX12-NEXT: v_mov_b32_e32 v11, v1
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v28, v1
; GFX12-NEXT: v_mov_b32_e32 v30, v1
; GFX12-NEXT: s_clause 0x4
-; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:464
-; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v1, v[27:30], s[0:1] offset:288
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v1, v[12:15], s[4:5] offset:464
+; GFX12-NEXT: global_store_b128 v1, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v1, v[4:7], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v1, v[27:30], s[4:5] offset:288
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -8977,13 +8977,13 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX8-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dwordx2 s[10:11], s[6:7], 0x0
; GFX8-NEXT: s_mov_b32 s7, 0
; GFX8-NEXT: s_mov_b32 s13, s7
-; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v29, s1
-; GFX8-NEXT: v_mov_b32_e32 v28, s0
+; GFX8-NEXT: v_mov_b32_e32 v29, s5
+; GFX8-NEXT: v_mov_b32_e32 v28, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s16, s11, 22
; GFX8-NEXT: s_lshr_b32 s18, s11, 23
@@ -9004,8 +9004,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_mov_b32 s6, s11
; GFX8-NEXT: s_lshr_b32 s12, s11, 24
; GFX8-NEXT: s_lshr_b32 s8, s10, 24
-; GFX8-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x10000
-; GFX8-NEXT: s_bfe_i64 s[4:5], s[12:13], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x10000
+; GFX8-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x10000
; GFX8-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX8-NEXT: s_bfe_i64 s[14:15], s[10:11], 0x10000
; GFX8-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
@@ -9025,91 +9025,91 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
; GFX8-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX8-NEXT: v_mov_b32_e32 v22, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x1b0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x1b0
; GFX8-NEXT: v_mov_b32_e32 v23, s17
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x1a0
+; GFX8-NEXT: s_add_u32 s16, s4, 0x1a0
; GFX8-NEXT: v_mov_b32_e32 v24, s18
; GFX8-NEXT: v_mov_b32_e32 v25, s19
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x190
+; GFX8-NEXT: s_add_u32 s16, s4, 0x190
; GFX8-NEXT: v_mov_b32_e32 v22, s20
; GFX8-NEXT: v_mov_b32_e32 v23, s21
; GFX8-NEXT: v_mov_b32_e32 v24, s22
; GFX8-NEXT: v_mov_b32_e32 v25, s23
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x180
+; GFX8-NEXT: s_add_u32 s16, s4, 0x180
; GFX8-NEXT: v_mov_b32_e32 v22, s24
; GFX8-NEXT: v_mov_b32_e32 v23, s25
; GFX8-NEXT: v_mov_b32_e32 v24, s26
; GFX8-NEXT: v_mov_b32_e32 v25, s27
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0xb0
+; GFX8-NEXT: s_add_u32 s16, s4, 0xb0
; GFX8-NEXT: v_mov_b32_e32 v22, s28
; GFX8-NEXT: v_mov_b32_e32 v23, s29
; GFX8-NEXT: v_mov_b32_e32 v24, s30
; GFX8-NEXT: v_mov_b32_e32 v25, s31
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0xa0
+; GFX8-NEXT: s_add_u32 s16, s4, 0xa0
; GFX8-NEXT: v_mov_b32_e32 v22, s34
; GFX8-NEXT: v_mov_b32_e32 v23, s35
; GFX8-NEXT: v_mov_b32_e32 v24, s36
; GFX8-NEXT: v_mov_b32_e32 v25, s37
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x90
+; GFX8-NEXT: s_add_u32 s16, s4, 0x90
; GFX8-NEXT: v_mov_b32_e32 v22, s38
; GFX8-NEXT: v_mov_b32_e32 v23, s39
; GFX8-NEXT: v_mov_b32_e32 v24, s40
; GFX8-NEXT: v_mov_b32_e32 v25, s41
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x80
+; GFX8-NEXT: s_add_u32 s16, s4, 0x80
; GFX8-NEXT: v_mov_b32_e32 v22, s42
; GFX8-NEXT: v_mov_b32_e32 v23, s43
; GFX8-NEXT: v_mov_b32_e32 v24, s44
; GFX8-NEXT: v_mov_b32_e32 v25, s45
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_mov_b32_e32 v27, s17
; GFX8-NEXT: v_mov_b32_e32 v26, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x70
+; GFX8-NEXT: s_add_u32 s16, s4, 0x70
; GFX8-NEXT: v_lshrrev_b16_e64 v20, 14, s10
; GFX8-NEXT: v_lshrrev_b16_e64 v21, 15, s10
; GFX8-NEXT: v_mov_b32_e32 v22, s46
; GFX8-NEXT: v_mov_b32_e32 v23, s47
; GFX8-NEXT: v_mov_b32_e32 v24, s48
; GFX8-NEXT: v_mov_b32_e32 v25, s49
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[26:27], v[22:25]
; GFX8-NEXT: v_bfe_i32 v26, v21, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v20, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v21, s17
; GFX8-NEXT: v_mov_b32_e32 v20, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x60
+; GFX8-NEXT: s_add_u32 s16, s4, 0x60
; GFX8-NEXT: v_lshrrev_b16_e64 v18, 12, s10
; GFX8-NEXT: v_lshrrev_b16_e64 v19, 13, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[20:21], v[24:27]
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 10, s10
; GFX8-NEXT: v_bfe_i32 v26, v19, 0, 1
@@ -9119,9 +9119,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v18, s16
-; GFX8-NEXT: s_add_u32 s16, s0, 0x50
+; GFX8-NEXT: s_add_u32 s16, s4, 0x50
; GFX8-NEXT: flat_store_dwordx4 v[18:19], v[24:27]
-; GFX8-NEXT: s_addc_u32 s17, s1, 0
+; GFX8-NEXT: s_addc_u32 s17, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v16, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v16, s16
@@ -9137,7 +9137,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v17, s17
-; GFX8-NEXT: s_add_u32 s10, s0, 64
+; GFX8-NEXT: s_add_u32 s10, s4, 64
; GFX8-NEXT: v_lshrrev_b16_e64 v5, 14, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 15, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v3, 12, s11
@@ -9154,15 +9154,15 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[16:17], v[24:27]
; GFX8-NEXT: v_lshrrev_b16_e64 v17, 3, s11
; GFX8-NEXT: v_lshrrev_b16_e64 v16, 1, s11
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v14, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v15, s11
; GFX8-NEXT: v_mov_b32_e32 v14, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 48
+; GFX8-NEXT: s_add_u32 s10, s4, 48
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[14:15], v[24:27]
; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 1
; GFX8-NEXT: v_bfe_i32 v26, v13, 0, 1
@@ -9171,18 +9171,18 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v12, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 32
+; GFX8-NEXT: s_add_u32 s10, s4, 32
; GFX8-NEXT: flat_store_dwordx4 v[12:13], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v10, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v10, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v11, s11
-; GFX8-NEXT: s_add_u32 s10, s0, 16
+; GFX8-NEXT: s_add_u32 s10, s4, 16
; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v9, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v8, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v8, s10
@@ -9190,32 +9190,32 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX8-NEXT: v_mov_b32_e32 v9, s11
; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[24:27]
-; GFX8-NEXT: s_add_u32 s10, s0, 0x170
+; GFX8-NEXT: s_add_u32 s10, s4, 0x170
; GFX8-NEXT: v_bfe_i32 v26, v7, 0, 1
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_mov_b32_e32 v24, s14
; GFX8-NEXT: v_mov_b32_e32 v25, s15
; GFX8-NEXT: flat_store_dwordx4 v[28:29], v[24:27]
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_bfe_i32 v26, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v24, v5, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v5, s10
; GFX8-NEXT: v_mov_b32_e32 v6, s11
-; GFX8-NEXT: s_add_u32 s10, s0, 0x160
+; GFX8-NEXT: s_add_u32 s10, s4, 0x160
; GFX8-NEXT: v_ashrrev_i32_e32 v27, 31, v26
; GFX8-NEXT: v_ashrrev_i32_e32 v25, 31, v24
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[24:27]
; GFX8-NEXT: v_bfe_i32 v5, v4, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v25, s11
; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX8-NEXT: v_mov_b32_e32 v24, s10
-; GFX8-NEXT: s_add_u32 s10, s0, 0x150
+; GFX8-NEXT: s_add_u32 s10, s4, 0x150
; GFX8-NEXT: flat_store_dwordx4 v[24:25], v[3:6]
; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 1
; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 1
-; GFX8-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s10
; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
@@ -9228,39 +9228,39 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e64 v25, 3, s8
; GFX8-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
; GFX8-NEXT: v_lshrrev_b16_e64 v6, 1, s8
-; GFX8-NEXT: s_add_u32 s8, s0, 0x140
+; GFX8-NEXT: s_add_u32 s8, s4, 0x140
; GFX8-NEXT: v_bfe_i32 v2, v23, 0, 1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v4, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x130
+; GFX8-NEXT: s_add_u32 s8, s4, 0x130
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: v_bfe_i32 v4, v22, 0, 1
; GFX8-NEXT: v_bfe_i32 v2, v6, 0, 1
; GFX8-NEXT: v_bfe_i32 v6, v21, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x120
+; GFX8-NEXT: s_add_u32 s8, s4, 0x120
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NEXT: v_bfe_i32 v21, v19, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v20, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GFX8-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NEXT: s_add_u32 s8, s0, 0x110
+; GFX8-NEXT: s_add_u32 s8, s4, 0x110
; GFX8-NEXT: v_bfe_i32 v6, v25, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
; GFX8-NEXT: v_bfe_i32 v25, v17, 0, 1
; GFX8-NEXT: v_bfe_i32 v23, v18, 0, 1
-; GFX8-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_bfe_i32 v4, v24, 0, 1
; GFX8-NEXT: v_bfe_i32 v19, v26, 0, 1
@@ -9270,31 +9270,31 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
; GFX8-NEXT: v_lshrrev_b16_e64 v14, 6, s12
; GFX8-NEXT: v_mov_b32_e32 v23, s6
-; GFX8-NEXT: s_add_u32 s6, s0, 0x100
+; GFX8-NEXT: s_add_u32 s6, s4, 0x100
; GFX8-NEXT: v_bfe_i32 v25, v16, 0, 1
; GFX8-NEXT: v_mov_b32_e32 v24, s7
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v15, 7, s12
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1f0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1f0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
; GFX8-NEXT: v_bfe_i32 v16, v15, 0, 1
; GFX8-NEXT: v_bfe_i32 v14, v14, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v12, 4, s12
; GFX8-NEXT: v_lshrrev_b16_e64 v13, 5, s12
; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1e0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1e0
; GFX8-NEXT: v_bfe_i32 v21, v27, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[14:17]
; GFX8-NEXT: v_bfe_i32 v29, v13, 0, 1
; GFX8-NEXT: v_bfe_i32 v27, v12, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_lshrrev_b16_e64 v10, 2, s12
; GFX8-NEXT: v_lshrrev_b16_e64 v11, 3, s12
@@ -9302,12 +9302,12 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_ashrrev_i32_e32 v30, 31, v29
; GFX8-NEXT: v_ashrrev_i32_e32 v28, 31, v27
; GFX8-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NEXT: s_add_u32 s6, s0, 0x1d0
+; GFX8-NEXT: s_add_u32 s6, s4, 0x1d0
; GFX8-NEXT: v_bfe_i32 v23, v9, 0, 1
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[27:30]
; GFX8-NEXT: v_bfe_i32 v11, v11, 0, 1
; GFX8-NEXT: v_bfe_i32 v9, v10, 0, 1
-; GFX8-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v9
@@ -9315,41 +9315,41 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX8-NEXT: v_lshrrev_b16_e64 v8, 1, s12
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
; GFX8-NEXT: v_bfe_i32 v14, v8, 0, 1
-; GFX8-NEXT: v_mov_b32_e32 v12, s4
-; GFX8-NEXT: s_add_u32 s4, s0, 0x1c0
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v12, s2
+; GFX8-NEXT: s_add_u32 s2, s4, 0x1c0
+; GFX8-NEXT: v_mov_b32_e32 v13, s3
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v14
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xf0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xf0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v26, 31, v25
; GFX8-NEXT: v_ashrrev_i32_e32 v24, 31, v23
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xe0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xe0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[23:26]
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_add_u32 s4, s0, 0xd0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v22, 31, v21
; GFX8-NEXT: v_ashrrev_i32_e32 v20, 31, v19
-; GFX8-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: s_add_u32 s2, s4, 0xd0
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[19:22]
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
-; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: s_add_u32 s0, s4, 0xc0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
@@ -9724,115 +9724,115 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v64i1_to_v64i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_mov_b32 s19, s5
+; GFX12-NEXT: s_mov_b32 s19, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s26, s3, 22
-; GFX12-NEXT: s_lshr_b32 s28, s3, 23
-; GFX12-NEXT: s_lshr_b32 s30, s3, 20
-; GFX12-NEXT: s_lshr_b32 s34, s3, 21
+; GFX12-NEXT: s_lshr_b32 s26, s1, 22
+; GFX12-NEXT: s_lshr_b32 s28, s1, 23
+; GFX12-NEXT: s_lshr_b32 s30, s1, 20
+; GFX12-NEXT: s_lshr_b32 s34, s1, 21
; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
-; GFX12-NEXT: s_lshr_b32 s20, s3, 18
+; GFX12-NEXT: s_lshr_b32 s20, s1, 18
; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
; GFX12-NEXT: v_dual_mov_b32 v12, 0 :: v_dual_mov_b32 v43, s27
; GFX12-NEXT: v_dual_mov_b32 v42, s26 :: v_dual_mov_b32 v45, s29
; GFX12-NEXT: v_dual_mov_b32 v44, s28 :: v_dual_mov_b32 v47, s31
-; GFX12-NEXT: s_lshr_b32 s22, s3, 19
+; GFX12-NEXT: s_lshr_b32 s22, s1, 19
; GFX12-NEXT: v_dual_mov_b32 v46, s30 :: v_dual_mov_b32 v49, s35
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s34
-; GFX12-NEXT: s_lshr_b32 s24, s3, 16
-; GFX12-NEXT: s_lshr_b32 s36, s3, 17
+; GFX12-NEXT: s_lshr_b32 s24, s1, 16
+; GFX12-NEXT: s_lshr_b32 s36, s1, 17
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
-; GFX12-NEXT: s_lshr_b32 s12, s2, 22
+; GFX12-NEXT: s_lshr_b32 s12, s0, 22
; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:432
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:416
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:432
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:416
; GFX12-NEXT: v_dual_mov_b32 v43, s21 :: v_dual_mov_b32 v42, s20
; GFX12-NEXT: v_dual_mov_b32 v45, s23 :: v_dual_mov_b32 v44, s22
; GFX12-NEXT: v_mov_b32_e32 v47, s25
-; GFX12-NEXT: s_lshr_b32 s14, s2, 23
+; GFX12-NEXT: s_lshr_b32 s14, s0, 23
; GFX12-NEXT: v_dual_mov_b32 v46, s24 :: v_dual_mov_b32 v49, s37
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s36
-; GFX12-NEXT: s_lshr_b32 s16, s2, 20
-; GFX12-NEXT: s_lshr_b32 s40, s2, 21
+; GFX12-NEXT: s_lshr_b32 s16, s0, 20
+; GFX12-NEXT: s_lshr_b32 s40, s0, 21
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
-; GFX12-NEXT: s_lshr_b32 s6, s2, 18
+; GFX12-NEXT: s_lshr_b32 s6, s0, 18
; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:400
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:384
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:400
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:384
; GFX12-NEXT: v_dual_mov_b32 v43, s13 :: v_dual_mov_b32 v42, s12
; GFX12-NEXT: v_dual_mov_b32 v45, s15 :: v_dual_mov_b32 v44, s14
; GFX12-NEXT: v_mov_b32_e32 v47, s17
-; GFX12-NEXT: s_lshr_b32 s8, s2, 19
+; GFX12-NEXT: s_lshr_b32 s8, s0, 19
; GFX12-NEXT: v_dual_mov_b32 v46, s16 :: v_dual_mov_b32 v49, s41
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s40
-; GFX12-NEXT: s_lshr_b32 s10, s2, 16
+; GFX12-NEXT: s_lshr_b32 s10, s0, 16
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
-; GFX12-NEXT: v_lshrrev_b16 v3, 14, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 15, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 12, s2
-; GFX12-NEXT: v_lshrrev_b16 v9, 13, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 14, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 15, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 12, s0
+; GFX12-NEXT: v_lshrrev_b16 v9, 13, s0
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:176
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:160
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:176
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:160
; GFX12-NEXT: v_dual_mov_b32 v43, s7 :: v_dual_mov_b32 v42, s6
; GFX12-NEXT: v_dual_mov_b32 v45, s9 :: v_dual_mov_b32 v44, s8
; GFX12-NEXT: v_mov_b32_e32 v47, s11
-; GFX12-NEXT: s_lshr_b32 s42, s2, 17
-; GFX12-NEXT: v_lshrrev_b16 v32, 10, s2
+; GFX12-NEXT: s_lshr_b32 s42, s0, 17
+; GFX12-NEXT: v_lshrrev_b16 v32, 10, s0
; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
-; GFX12-NEXT: v_lshrrev_b16 v34, 11, s2
-; GFX12-NEXT: v_lshrrev_b16 v33, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v35, 9, s2
-; GFX12-NEXT: v_lshrrev_b16 v27, 6, s2
-; GFX12-NEXT: v_lshrrev_b16 v29, 7, s2
-; GFX12-NEXT: v_lshrrev_b16 v30, 4, s2
-; GFX12-NEXT: v_lshrrev_b16 v31, 5, s2
-; GFX12-NEXT: v_lshrrev_b16 v24, 2, s2
-; GFX12-NEXT: v_lshrrev_b16 v25, 3, s2
-; GFX12-NEXT: v_lshrrev_b16 v23, 1, s2
-; GFX12-NEXT: v_lshrrev_b16 v18, 14, s3
-; GFX12-NEXT: v_lshrrev_b16 v20, 15, s3
-; GFX12-NEXT: v_lshrrev_b16 v16, 12, s3
-; GFX12-NEXT: v_lshrrev_b16 v19, 13, s3
-; GFX12-NEXT: v_lshrrev_b16 v0, 10, s3
-; GFX12-NEXT: v_lshrrev_b16 v1, 11, s3
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v15, 9, s3
-; GFX12-NEXT: v_lshrrev_b16 v14, 6, s3
-; GFX12-NEXT: v_lshrrev_b16 v17, 7, s3
-; GFX12-NEXT: v_lshrrev_b16 v21, 4, s3
-; GFX12-NEXT: v_lshrrev_b16 v22, 5, s3
-; GFX12-NEXT: v_lshrrev_b16 v26, 2, s3
-; GFX12-NEXT: v_lshrrev_b16 v28, 3, s3
-; GFX12-NEXT: v_lshrrev_b16 v36, 1, s3
-; GFX12-NEXT: s_lshr_b32 s18, s3, 24
-; GFX12-NEXT: s_mov_b32 s4, s3
-; GFX12-NEXT: s_lshr_b32 s38, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v34, 11, s0
+; GFX12-NEXT: v_lshrrev_b16 v33, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v35, 9, s0
+; GFX12-NEXT: v_lshrrev_b16 v27, 6, s0
+; GFX12-NEXT: v_lshrrev_b16 v29, 7, s0
+; GFX12-NEXT: v_lshrrev_b16 v30, 4, s0
+; GFX12-NEXT: v_lshrrev_b16 v31, 5, s0
+; GFX12-NEXT: v_lshrrev_b16 v24, 2, s0
+; GFX12-NEXT: v_lshrrev_b16 v25, 3, s0
+; GFX12-NEXT: v_lshrrev_b16 v23, 1, s0
+; GFX12-NEXT: v_lshrrev_b16 v18, 14, s1
+; GFX12-NEXT: v_lshrrev_b16 v20, 15, s1
+; GFX12-NEXT: v_lshrrev_b16 v16, 12, s1
+; GFX12-NEXT: v_lshrrev_b16 v19, 13, s1
+; GFX12-NEXT: v_lshrrev_b16 v0, 10, s1
+; GFX12-NEXT: v_lshrrev_b16 v1, 11, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v15, 9, s1
+; GFX12-NEXT: v_lshrrev_b16 v14, 6, s1
+; GFX12-NEXT: v_lshrrev_b16 v17, 7, s1
+; GFX12-NEXT: v_lshrrev_b16 v21, 4, s1
+; GFX12-NEXT: v_lshrrev_b16 v22, 5, s1
+; GFX12-NEXT: v_lshrrev_b16 v26, 2, s1
+; GFX12-NEXT: v_lshrrev_b16 v28, 3, s1
+; GFX12-NEXT: v_lshrrev_b16 v36, 1, s1
+; GFX12-NEXT: s_lshr_b32 s18, s1, 24
+; GFX12-NEXT: s_mov_b32 s2, s1
+; GFX12-NEXT: s_lshr_b32 s38, s0, 24
; GFX12-NEXT: v_dual_mov_b32 v46, s10 :: v_dual_mov_b32 v49, s43
; GFX12-NEXT: v_bfe_i32 v52, v5, 0, 1
; GFX12-NEXT: v_bfe_i32 v50, v3, 0, 1
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000
; GFX12-NEXT: v_mov_b32_e32 v48, s42
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:144
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:144
; GFX12-NEXT: v_bfe_i32 v44, v9, 0, 1
; GFX12-NEXT: v_bfe_i32 v42, v7, 0, 1
; GFX12-NEXT: v_lshrrev_b16 v41, 2, s18
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:128
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:128
; GFX12-NEXT: v_lshrrev_b16 v54, 3, s18
; GFX12-NEXT: v_lshrrev_b16 v56, 6, s38
; GFX12-NEXT: v_ashrrev_i32_e32 v53, 31, v52
@@ -9841,9 +9841,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v43, 31, v42
; GFX12-NEXT: v_bfe_i32 v46, v56, 0, 1
; GFX12-NEXT: v_bfe_i32 v56, v54, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:112
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[4:5] offset:112
; GFX12-NEXT: v_bfe_i32 v34, v34, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[42:45], s[0:1] offset:96
+; GFX12-NEXT: global_store_b128 v12, v[42:45], s[4:5] offset:96
; GFX12-NEXT: v_bfe_i32 v32, v32, 0, 1
; GFX12-NEXT: v_bfe_i32 v54, v41, 0, 1
; GFX12-NEXT: v_bfe_i32 v43, v35, 0, 1
@@ -9855,9 +9855,9 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v42, 31, v41
; GFX12-NEXT: v_lshrrev_b16 v40, 5, s18
; GFX12-NEXT: v_lshrrev_b16 v37, 6, s18
-; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:80
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[4:5] offset:80
; GFX12-NEXT: v_bfe_i32 v32, v39, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1] offset:64
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[4:5] offset:64
; GFX12-NEXT: v_bfe_i32 v41, v29, 0, 1
; GFX12-NEXT: v_bfe_i32 v39, v27, 0, 1
; GFX12-NEXT: v_bfe_i32 v34, v40, 0, 1
@@ -9869,23 +9869,23 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_bfe_i32 v62, v37, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v61, 31, v60
; GFX12-NEXT: v_ashrrev_i32_e32 v59, 31, v58
-; GFX12-NEXT: global_store_b128 v12, v[39:42], s[0:1] offset:48
+; GFX12-NEXT: global_store_b128 v12, v[39:42], s[4:5] offset:48
; GFX12-NEXT: v_bfe_i32 v39, v25, 0, 1
; GFX12-NEXT: v_bfe_i32 v37, v24, 0, 1
; GFX12-NEXT: v_bfe_i32 v64, v38, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[58:61], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v12, v[58:61], s[4:5] offset:32
; GFX12-NEXT: v_bfe_i32 v43, v23, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v40, 31, v39
; GFX12-NEXT: v_ashrrev_i32_e32 v38, 31, v37
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
; GFX12-NEXT: v_bfe_i32 v24, v36, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v44, 31, v43
-; GFX12-NEXT: v_dual_mov_b32 v41, s2 :: v_dual_mov_b32 v42, s3
-; GFX12-NEXT: v_mov_b32_e32 v23, s5
-; GFX12-NEXT: global_store_b128 v12, v[37:40], s[0:1] offset:16
+; GFX12-NEXT: v_dual_mov_b32 v41, s0 :: v_dual_mov_b32 v42, s1
+; GFX12-NEXT: v_mov_b32_e32 v23, s3
+; GFX12-NEXT: global_store_b128 v12, v[37:40], s[4:5] offset:16
; GFX12-NEXT: v_bfe_i32 v38, v20, 0, 1
; GFX12-NEXT: v_bfe_i32 v36, v18, 0, 1
-; GFX12-NEXT: global_store_b128 v12, v[41:44], s[0:1]
+; GFX12-NEXT: global_store_b128 v12, v[41:44], s[4:5]
; GFX12-NEXT: v_bfe_i32 v20, v19, 0, 1
; GFX12-NEXT: v_bfe_i32 v18, v16, 0, 1
; GFX12-NEXT: v_ashrrev_i32_e32 v39, 31, v38
@@ -9901,8 +9901,8 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_lshrrev_b16 v11, 4, s38
; GFX12-NEXT: v_lshrrev_b16 v2, 1, s38
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:368
-; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:352
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[4:5] offset:368
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[4:5] offset:352
; GFX12-NEXT: v_bfe_i32 v38, v1, 0, 1
; GFX12-NEXT: v_bfe_i32 v36, v0, 0, 1
; GFX12-NEXT: v_bfe_i32 v52, v55, 0, 1
@@ -9932,7 +9932,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v33, 31, v32
; GFX12-NEXT: v_ashrrev_i32_e32 v21, 31, v20
; GFX12-NEXT: v_ashrrev_i32_e32 v19, 31, v18
-; GFX12-NEXT: v_dual_mov_b32 v22, s4 :: v_dual_mov_b32 v51, s9
+; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v51, s9
; GFX12-NEXT: v_dual_mov_b32 v50, s8 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_ashrrev_i32_e32 v49, 31, v48
; GFX12-NEXT: v_ashrrev_i32_e32 v47, 31, v46
@@ -9949,22 +9949,22 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX12-NEXT: s_clause 0x7
-; GFX12-NEXT: global_store_b128 v12, v[36:39], s[0:1] offset:336
-; GFX12-NEXT: global_store_b128 v12, v[18:21], s[0:1] offset:320
-; GFX12-NEXT: global_store_b128 v12, v[14:17], s[0:1] offset:304
-; GFX12-NEXT: global_store_b128 v12, v[40:43], s[0:1] offset:288
-; GFX12-NEXT: global_store_b128 v12, v[26:29], s[0:1] offset:272
-; GFX12-NEXT: global_store_b128 v12, v[22:25], s[0:1] offset:256
-; GFX12-NEXT: global_store_b128 v12, v[62:65], s[0:1] offset:496
-; GFX12-NEXT: global_store_b128 v12, v[32:35], s[0:1] offset:480
+; GFX12-NEXT: global_store_b128 v12, v[36:39], s[4:5] offset:336
+; GFX12-NEXT: global_store_b128 v12, v[18:21], s[4:5] offset:320
+; GFX12-NEXT: global_store_b128 v12, v[14:17], s[4:5] offset:304
+; GFX12-NEXT: global_store_b128 v12, v[40:43], s[4:5] offset:288
+; GFX12-NEXT: global_store_b128 v12, v[26:29], s[4:5] offset:272
+; GFX12-NEXT: global_store_b128 v12, v[22:25], s[4:5] offset:256
+; GFX12-NEXT: global_store_b128 v12, v[62:65], s[4:5] offset:496
+; GFX12-NEXT: global_store_b128 v12, v[32:35], s[4:5] offset:480
; GFX12-NEXT: v_mov_b32_e32 v0, s6
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v12, v[54:57], s[0:1] offset:464
-; GFX12-NEXT: global_store_b128 v12, v[50:53], s[0:1] offset:448
-; GFX12-NEXT: global_store_b128 v12, v[46:49], s[0:1] offset:240
-; GFX12-NEXT: global_store_b128 v12, v[8:11], s[0:1] offset:224
-; GFX12-NEXT: global_store_b128 v12, v[4:7], s[0:1] offset:208
-; GFX12-NEXT: global_store_b128 v12, v[0:3], s[0:1] offset:192
+; GFX12-NEXT: global_store_b128 v12, v[54:57], s[4:5] offset:464
+; GFX12-NEXT: global_store_b128 v12, v[50:53], s[4:5] offset:448
+; GFX12-NEXT: global_store_b128 v12, v[46:49], s[4:5] offset:240
+; GFX12-NEXT: global_store_b128 v12, v[8:11], s[4:5] offset:224
+; GFX12-NEXT: global_store_b128 v12, v[4:7], s[4:5] offset:208
+; GFX12-NEXT: global_store_b128 v12, v[0:3], s[4:5] offset:192
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index a87fa8bf36d9e..a5ca228a4c092 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -38,13 +38,13 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: constant_load_i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_short v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -77,12 +77,12 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -119,13 +119,13 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v2i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -147,12 +147,12 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -198,18 +198,18 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v3i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: s_add_u32 s4, s0, 4
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s5, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s0
; GCN-NOHSA-VI-NEXT: flat_store_short v[2:3], v4
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v5
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -252,15 +252,15 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] offset:4
-; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5] offset:4
+; GFX12-NEXT: global_store_b32 v0, v2, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -299,14 +299,14 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v4i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -328,13 +328,13 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v4i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -377,16 +377,16 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
;
; GCN-NOHSA-VI-LABEL: constant_load_v8i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -408,14 +408,14 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v8i16:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -608,41 +608,41 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
;
; GCN-NOHSA-VI-LABEL: constant_load_v16i16_align2:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 14
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 12
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 10
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 8
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 6
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 4
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 30
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 28
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 26
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 14
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 12
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 10
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 8
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 6
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 30
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 28
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 26
; GCN-NOHSA-VI-NEXT: flat_load_ushort v16, v[0:1]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v17, v[2:3]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v18, v[4:5]
@@ -651,35 +651,35 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-NOHSA-VI-NEXT: flat_load_ushort v21, v[10:11]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v22, v[12:13]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v23, v[14:15]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 24
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 22
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 20
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 18
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 2
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 24
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 22
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 20
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 18
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 16
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s2, 2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s3, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s2
; GCN-NOHSA-VI-NEXT: flat_load_ushort v0, v[0:1]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v24, v[2:3]
; GCN-NOHSA-VI-NEXT: flat_load_ushort v4, v[4:5]
@@ -742,26 +742,26 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
;
; GFX12-LABEL: constant_load_v16i16_align2:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0xf
-; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28
-; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24
-; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20
-; GFX12-NEXT: global_load_u16 v0, v8, s[0:1] offset:16
-; GFX12-NEXT: global_load_u16 v7, v8, s[0:1] offset:12
-; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8
-; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4
-; GFX12-NEXT: global_load_u16 v4, v8, s[0:1]
-; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30
-; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26
-; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22
-; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18
-; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14
-; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10
-; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6
-; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2
+; GFX12-NEXT: global_load_u16 v3, v8, s[2:3] offset:28
+; GFX12-NEXT: global_load_u16 v2, v8, s[2:3] offset:24
+; GFX12-NEXT: global_load_u16 v1, v8, s[2:3] offset:20
+; GFX12-NEXT: global_load_u16 v0, v8, s[2:3] offset:16
+; GFX12-NEXT: global_load_u16 v7, v8, s[2:3] offset:12
+; GFX12-NEXT: global_load_u16 v6, v8, s[2:3] offset:8
+; GFX12-NEXT: global_load_u16 v5, v8, s[2:3] offset:4
+; GFX12-NEXT: global_load_u16 v4, v8, s[2:3]
+; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[2:3] offset:30
+; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[2:3] offset:26
+; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[2:3] offset:22
+; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[2:3] offset:18
+; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[2:3] offset:14
+; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[2:3] offset:10
+; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[2:3] offset:6
+; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[2:3] offset:2
; GFX12-NEXT: s_wait_loadcnt 0x4
; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -808,13 +808,13 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -837,12 +837,12 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i16_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -885,13 +885,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_sshort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -915,12 +915,12 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i16_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -963,13 +963,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -992,12 +992,12 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1040,13 +1040,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_sshort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: flat_store_dword v[0:1], v2
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -1070,12 +1070,12 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i16_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i16 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i16 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1118,16 +1118,16 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1152,16 +1152,16 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_and_b32 s1, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1205,16 +1205,16 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1240,16 +1240,16 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sext_i32_i16 s3, s2
-; GFX12-NEXT: s_ashr_i32 s2, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s0
+; GFX12-NEXT: s_ashr_i32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1298,18 +1298,18 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1338,16 +1338,16 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v3i16_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX12-NEXT: s_and_b32 s4, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_and_b32 s2, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1397,17 +1397,17 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-NOHSA-VI-NEXT: s_endpgm
@@ -1440,16 +1440,16 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v3i16_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s2, 16
-; GFX12-NEXT: s_sext_i32_i16 s2, s2
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s3
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: s_ashr_i32 s2, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1503,20 +1503,20 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1545,19 +1545,19 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i16_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s4, s3, 16
-; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
-; GFX12-NEXT: s_and_b32 s5, s2, 0xffff
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_lshr_b32 s2, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_and_b32 s3, s0, 0xffff
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1611,20 +1611,20 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s3, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s2, 16
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s1, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s1, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -1655,18 +1655,18 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s3, 16
-; GFX12-NEXT: s_ashr_i32 s5, s2, 16
-; GFX12-NEXT: s_sext_i32_i16 s2, s2
-; GFX12-NEXT: s_sext_i32_i16 s3, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: s_ashr_i32 s2, s1, 16
+; GFX12-NEXT: s_ashr_i32 s3, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1743,34 +1743,34 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s3, s2, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i16_to_v8i32:
@@ -1807,26 +1807,26 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v8i16_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX12-NEXT: s_and_b32 s9, s6, 0xffff
-; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_lshr_b32 s2, s5, 16
-; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
-; GFX12-NEXT: s_lshr_b32 s5, s4, 16
-; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-NEXT: s_lshr_b32 s8, s3, 16
+; GFX12-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX12-NEXT: s_and_b32 s9, s2, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX12-NEXT: s_lshr_b32 s7, s0, 16
+; GFX12-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1903,34 +1903,34 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s5, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s4, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s6, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s1, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s0, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s2, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s3, s3
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v8i16_to_v8i32:
@@ -1969,26 +1969,26 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s8, s7, 16
-; GFX12-NEXT: s_ashr_i32 s9, s6, 16
-; GFX12-NEXT: s_sext_i32_i16 s6, s6
-; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_ashr_i32 s2, s5, 16
-; GFX12-NEXT: s_ashr_i32 s3, s4, 16
-; GFX12-NEXT: s_sext_i32_i16 s5, s5
-; GFX12-NEXT: s_sext_i32_i16 s4, s4
+; GFX12-NEXT: s_ashr_i32 s8, s3, 16
+; GFX12-NEXT: s_ashr_i32 s9, s2, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s2
+; GFX12-NEXT: s_sext_i32_i16 s3, s3
+; GFX12-NEXT: s_ashr_i32 s6, s1, 16
+; GFX12-NEXT: s_ashr_i32 s7, s0, 16
+; GFX12-NEXT: s_sext_i32_i16 s1, s1
+; GFX12-NEXT: s_sext_i32_i16 s0, s0
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2108,60 +2108,60 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s8, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s11, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s11, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s8, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s8, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s11, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s11, 0xffff
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s13, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s12, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s15, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s14, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i16_to_v16i32:
@@ -2219,40 +2219,40 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GFX12-LABEL: constant_zextload_v16i16_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s16, s11, 16
-; GFX12-NEXT: s_and_b32 s11, s11, 0xffff
-; GFX12-NEXT: s_and_b32 s17, s10, 0xffff
-; GFX12-NEXT: s_lshr_b32 s10, s10, 16
-; GFX12-NEXT: s_lshr_b32 s14, s9, 16
-; GFX12-NEXT: s_and_b32 s9, s9, 0xffff
-; GFX12-NEXT: s_lshr_b32 s15, s8, 16
-; GFX12-NEXT: s_and_b32 s8, s8, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s10
-; GFX12-NEXT: s_lshr_b32 s12, s7, 16
-; GFX12-NEXT: s_and_b32 s7, s7, 0xffff
-; GFX12-NEXT: s_lshr_b32 s13, s6, 16
-; GFX12-NEXT: s_and_b32 s6, s6, 0xffff
+; GFX12-NEXT: s_lshr_b32 s16, s15, 16
+; GFX12-NEXT: s_and_b32 s15, s15, 0xffff
+; GFX12-NEXT: s_and_b32 s17, s14, 0xffff
+; GFX12-NEXT: s_lshr_b32 s14, s14, 16
+; GFX12-NEXT: s_lshr_b32 s0, s9, 16
+; GFX12-NEXT: s_and_b32 s1, s9, 0xffff
+; GFX12-NEXT: s_lshr_b32 s2, s8, 16
+; GFX12-NEXT: s_and_b32 s3, s8, 0xffff
+; GFX12-NEXT: s_lshr_b32 s6, s11, 16
+; GFX12-NEXT: s_and_b32 s7, s11, 0xffff
+; GFX12-NEXT: s_lshr_b32 s8, s10, 16
+; GFX12-NEXT: s_and_b32 s9, s10, 0xffff
+; GFX12-NEXT: s_lshr_b32 s10, s13, 16
+; GFX12-NEXT: s_and_b32 s11, s13, 0xffff
+; GFX12-NEXT: s_lshr_b32 s13, s12, 16
+; GFX12-NEXT: s_and_b32 s12, s12, 0xffff
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s14
; GFX12-NEXT: v_dual_mov_b32 v0, s17 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_lshr_b32 s2, s5, 16
-; GFX12-NEXT: s_and_b32 s3, s5, 0xffff
-; GFX12-NEXT: s_lshr_b32 s5, s4, 16
-; GFX12-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v9, s8
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
+; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s2
+; GFX12-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2372,60 +2372,60 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s12, s5, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s13, s4, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s14, s7, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s6, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s16, s9, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s17, s8, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s11, 16
-; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s10, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s2, s9, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s3, s8, 16
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s9
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s8
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s11, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s10, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s16, s13, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s17, s12, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s0, s15, 16
+; GCN-NOHSA-VI-NEXT: s_ashr_i32 s1, s14, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s15
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s9, s9
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s8
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s13, s13
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s7, s7
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s6, s6
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s14
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s5, s5
-; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s4, s4
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s11
+; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s10, s10
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_sextload_v16i16_to_v16i32:
@@ -2487,40 +2487,40 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
;
; GFX12-LABEL: constant_sextload_v16i16_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s16, s11, 16
-; GFX12-NEXT: s_ashr_i32 s17, s10, 16
-; GFX12-NEXT: s_sext_i32_i16 s10, s10
-; GFX12-NEXT: s_sext_i32_i16 s11, s11
-; GFX12-NEXT: s_ashr_i32 s14, s9, 16
-; GFX12-NEXT: s_ashr_i32 s15, s8, 16
-; GFX12-NEXT: s_sext_i32_i16 s9, s9
-; GFX12-NEXT: s_sext_i32_i16 s8, s8
+; GFX12-NEXT: s_ashr_i32 s16, s15, 16
+; GFX12-NEXT: s_ashr_i32 s17, s14, 16
+; GFX12-NEXT: s_sext_i32_i16 s14, s14
+; GFX12-NEXT: s_sext_i32_i16 s15, s15
+; GFX12-NEXT: s_ashr_i32 s0, s9, 16
+; GFX12-NEXT: s_ashr_i32 s1, s8, 16
+; GFX12-NEXT: s_sext_i32_i16 s2, s9
+; GFX12-NEXT: s_sext_i32_i16 s3, s8
+; GFX12-NEXT: s_ashr_i32 s6, s11, 16
+; GFX12-NEXT: s_ashr_i32 s7, s10, 16
+; GFX12-NEXT: s_sext_i32_i16 s8, s11
+; GFX12-NEXT: s_sext_i32_i16 s9, s10
+; GFX12-NEXT: s_ashr_i32 s10, s13, 16
+; GFX12-NEXT: s_ashr_i32 s11, s12, 16
+; GFX12-NEXT: s_sext_i32_i16 s13, s13
+; GFX12-NEXT: s_sext_i32_i16 s12, s12
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
-; GFX12-NEXT: s_ashr_i32 s12, s7, 16
-; GFX12-NEXT: s_ashr_i32 s13, s6, 16
-; GFX12-NEXT: s_sext_i32_i16 s7, s7
-; GFX12-NEXT: s_sext_i32_i16 s6, s6
-; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_ashr_i32 s2, s5, 16
-; GFX12-NEXT: s_ashr_i32 s3, s4, 16
-; GFX12-NEXT: s_sext_i32_i16 s5, s5
-; GFX12-NEXT: s_sext_i32_i16 s4, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s11
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s10
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s7
+; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v11, s6
+; GFX12-NEXT: v_dual_mov_b32 v10, s8 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s3 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s2
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5397,14 +5397,14 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_zextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5429,13 +5429,13 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i16_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5485,13 +5485,13 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GCN-NOHSA-VI-LABEL: constant_sextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5519,15 +5519,15 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i16_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5572,14 +5572,14 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5604,13 +5604,13 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i16_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5655,13 +5655,13 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
; GCN-NOHSA-VI-NEXT: flat_load_ushort v2, v[0:1]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5689,15 +5689,15 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i16_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5744,18 +5744,18 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s1, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s0, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -5782,17 +5782,17 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i16_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s2
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5841,19 +5841,19 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -5882,17 +5882,17 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i16_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s4, s2, 16
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5954,28 +5954,28 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s2, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s2, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s3, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v4i16_to_v4i64:
@@ -6009,22 +6009,22 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i16_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s4, 0xffff, s2
+; GFX12-NEXT: s_and_b32 s2, 0xffff, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s0, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_pack_hl_b32_b16 s0, s1, 0
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6094,32 +6094,32 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s3
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s3, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s1, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s4, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -6156,25 +6156,25 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i16_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s6, s3
-; GFX12-NEXT: s_lshr_b32 s8, s3, 16
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x100000
-; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_mov_b32 s6, s1
+; GFX12-NEXT: s_lshr_b32 s8, s1, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[0:1], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v5, s7
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v7, s9
-; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6266,46 +6266,46 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s0, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s1, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s1, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s2, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s2, s2, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s3, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s3, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s9
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v8i16_to_v8i64:
@@ -6357,31 +6357,32 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v8i16_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_and_b32 s2, 0xffff, s7
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s3, s7, 0
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s6, 0
+; GFX12-NEXT: s_and_b32 s6, 0xffff, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_pack_hl_b32_b16 s3, s3, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s6
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s5, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_pack_hl_b32_b16 s2, s4, 0
-; GFX12-NEXT: s_and_b32 s3, 0xffff, s4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: s_pack_hl_b32_b16 s3, s2, 0
+; GFX12-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s3
+; GFX12-NEXT: s_pack_hl_b32_b16 s2, s1, 0
+; GFX12-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: s_pack_hl_b32_b16 s1, s0, 0
+; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6492,57 +6493,57 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GCN-NOHSA-VI-LABEL: constant_sextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s6, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s7
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s4, s4, 16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s5
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s7, 16
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s2, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, s3
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[0:1], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s1
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s1, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s3, 16
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 32
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s17
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 32
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s4, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s15
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
-; GCN-NOHSA-VI-NEXT: s_add_u32 s6, s0, 16
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s7, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
@@ -6598,38 +6599,38 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i16_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s14, s7
-; GFX12-NEXT: s_lshr_b32 s16, s7, 16
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[6:7], 0x100000
-; GFX12-NEXT: s_lshr_b32 s6, s6, 16
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x100000
-; GFX12-NEXT: s_mov_b32 s8, s5
-; GFX12-NEXT: s_lshr_b32 s10, s5, 16
+; GFX12-NEXT: s_mov_b32 s14, s3
+; GFX12-NEXT: s_lshr_b32 s16, s3, 16
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x100000
+; GFX12-NEXT: s_lshr_b32 s2, s2, 16
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[0:1], 0x100000
+; GFX12-NEXT: s_mov_b32 s8, s1
+; GFX12-NEXT: s_lshr_b32 s10, s1, 16
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000
; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000
-; GFX12-NEXT: s_lshr_b32 s4, s4, 16
-; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000
+; GFX12-NEXT: s_lshr_b32 s0, s0, 16
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s13
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x100000
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v9, s15
+; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v9, s15
; GFX12-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v11, s17
-; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v13, s9
+; GFX12-NEXT: v_dual_mov_b32 v10, s16 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
+; GFX12-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v13, s9
; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s11
-; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s5
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: v_dual_mov_b32 v14, s10 :: v_dual_mov_b32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6781,82 +6782,82 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
;
; GCN-NOHSA-VI-LABEL: constant_zextload_v16i16_to_v16i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-NOHSA-VI-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s8, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s8, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s9, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s9, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s10, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s10, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s11, 16
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff
-; GCN-NOHSA-VI-NEXT: s_lshr_b32 s2, s9, 16
-; GCN-NOHSA-VI-NEXT: s_and_b32 s3, s9, 0xffff
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x50
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 64
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s14, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s15, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s12, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff
+; GCN-NOHSA-VI-NEXT: s_lshr_b32 s0, s13, 16
+; GCN-NOHSA-VI-NEXT: s_and_b32 s1, s13, 0xffff
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x50
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 64
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x70
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x70
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 0x60
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 0x60
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s17
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 48
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 48
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 32
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
-; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
-; GCN-NOHSA-VI-NEXT: s_add_u32 s2, s0, 16
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
-; GCN-NOHSA-VI-NEXT: s_addc_u32 s3, s1, 0
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 32
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s3
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s5
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s13
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NOHSA-VI-NEXT: s_add_u32 s0, s4, 16
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NOHSA-VI-NEXT: s_addc_u32 s1, s5, 0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s1
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s0
; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s3
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s5
+; GCN-NOHSA-VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i16_to_v16i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index b0d8f72c22ba7..5692d1d32c071 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -34,13 +34,13 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-NOHSA-LABEL: constant_load_i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -73,12 +73,12 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -117,14 +117,14 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v2i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -158,13 +158,13 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -207,15 +207,15 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -255,13 +255,13 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX12-NEXT: s_load_b96 s[0:2], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -304,16 +304,16 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v4i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -349,14 +349,14 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v4i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -893,33 +893,33 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
;
; GFX8-NOHSA-LABEL: constant_load_v11i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x20
-; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x20
+; GFX8-NOHSA-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s15
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[0:1], v[4:6]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1421,14 +1421,14 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX8-NOHSA-LABEL: constant_zextload_i32_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1462,12 +1462,12 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_zextload_i32_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1508,15 +1508,15 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX8-NOHSA-LABEL: constant_sextload_i32_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1552,15 +1552,15 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
;
; GFX12-LABEL: constant_sextload_i32_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1599,14 +1599,14 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1640,12 +1640,12 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1686,15 +1686,15 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1730,15 +1730,15 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s3, s2, 31
+; GFX12-NEXT: s_ashr_i32 s1, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1781,16 +1781,16 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1829,14 +1829,14 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1884,19 +1884,18 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 31
-; GFX8-NOHSA-NEXT: s_mov_b32 s1, s3
-; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s2, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s0, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1940,17 +1939,17 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s4, s3, 31
-; GFX12-NEXT: s_ashr_i32 s5, s2, 31
+; GFX12-NEXT: s_ashr_i32 s2, s1, 31
+; GFX12-NEXT: s_ashr_i32 s3, s0, 31
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2004,23 +2003,23 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2070,17 +2069,17 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s5
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: v_mov_b32_e32 v2, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2145,29 +2144,29 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s5, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s4, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s7, 31
-; GFX8-NOHSA-NEXT: s_ashr_i32 s3, s6, 31
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s1, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s7, s0, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s3, 31
+; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s2, 31
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2226,22 +2225,22 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s8, s7, 31
-; GFX12-NEXT: s_ashr_i32 s9, s6, 31
-; GFX12-NEXT: s_ashr_i32 s2, s5, 31
-; GFX12-NEXT: s_ashr_i32 s3, s4, 31
+; GFX12-NEXT: s_ashr_i32 s8, s3, 31
+; GFX12-NEXT: s_ashr_i32 s9, s2, 31
+; GFX12-NEXT: s_ashr_i32 s6, s1, 31
+; GFX12-NEXT: s_ashr_i32 s7, s0, 31
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v5, s3
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v7, s2
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2683,32 +2682,32 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
;
; GFX12-LABEL: constant_sextload_v8i32_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_ashr_i32 s16, s11, 31
-; GFX12-NEXT: s_ashr_i32 s17, s10, 31
-; GFX12-NEXT: s_ashr_i32 s14, s9, 31
-; GFX12-NEXT: s_ashr_i32 s15, s8, 31
+; GFX12-NEXT: s_ashr_i32 s16, s15, 31
+; GFX12-NEXT: s_ashr_i32 s17, s14, 31
+; GFX12-NEXT: s_ashr_i32 s6, s13, 31
+; GFX12-NEXT: s_ashr_i32 s7, s12, 31
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v1, s17
-; GFX12-NEXT: s_ashr_i32 s12, s7, 31
-; GFX12-NEXT: s_ashr_i32 s13, s6, 31
-; GFX12-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v3, s16
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v5, s15
-; GFX12-NEXT: s_ashr_i32 s2, s5, 31
-; GFX12-NEXT: s_ashr_i32 s3, s4, 31
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v7, s14
-; GFX12-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v9, s13
-; GFX12-NEXT: v_dual_mov_b32 v8, s6 :: v_dual_mov_b32 v11, s12
-; GFX12-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v13, s3
-; GFX12-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s5
+; GFX12-NEXT: s_ashr_i32 s2, s11, 31
+; GFX12-NEXT: s_ashr_i32 s3, s10, 31
+; GFX12-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v3, s16
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v5, s7
+; GFX12-NEXT: s_ashr_i32 s0, s9, 31
+; GFX12-NEXT: s_ashr_i32 s1, s8, 31
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v7, s6
+; GFX12-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v9, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v11, s2
+; GFX12-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v13, s1
+; GFX12-NEXT: v_dual_mov_b32 v12, s8 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s9
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 66c73fda38743..94325846f86f2 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -34,14 +34,14 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: constant_load_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -63,13 +63,13 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
;
; GFX12-LABEL: constant_load_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -111,16 +111,16 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v2i64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -142,14 +142,14 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v2i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -205,24 +205,24 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-LABEL: constant_load_v3i64:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x10
-; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NEXT: v_mov_b32_e32 v4, s3
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x10
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT: s_add_u32 s6, s4, 16
+; GFX8-NEXT: s_addc_u32 s7, s5, 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, s8
; GFX8-NEXT: v_mov_b32_e32 v6, s9
; GFX8-NEXT: flat_store_dwordx2 v[3:4], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NEXT: v_mov_b32_e32 v3, s7
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -253,19 +253,19 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v3i64:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b64 s[8:9], s[2:3], 0x10
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[8:9], s[6:7], 0x10
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v5, s9
-; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: v_dual_mov_b32 v4, s8 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b64 v6, v[4:5], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v6, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 889755c23bbc7..29ca6c6cae38d 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -39,13 +39,13 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-NOHSA-LABEL: constant_load_i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_byte v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -78,12 +78,12 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
;
; GFX12-LABEL: constant_load_i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b8 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -126,13 +126,13 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v2i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -165,12 +165,12 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v2i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -217,14 +217,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v3i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 2
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
@@ -278,14 +278,14 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v3i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[0:1] offset:2
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_d16_hi_b8 v0, v1, s[4:5] offset:2
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -322,13 +322,13 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v4i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -350,12 +350,12 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v4i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -394,14 +394,14 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX8-NOHSA-LABEL: constant_load_v8i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -423,13 +423,13 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
;
; GFX12-LABEL: constant_load_v8i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -472,16 +472,16 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
;
; GFX8-NOHSA-LABEL: constant_load_v16i8:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -503,14 +503,14 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
;
; GFX12-LABEL: constant_load_v16i8:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s7
-; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -553,13 +553,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -582,12 +582,12 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -630,13 +630,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -660,12 +660,12 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -708,13 +708,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -737,12 +737,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -785,13 +785,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dword v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -815,12 +815,12 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_i8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_i8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -868,13 +868,13 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2
; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xff, v2
@@ -911,16 +911,16 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -968,13 +968,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v3, 8, v2
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -1011,16 +1011,16 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1070,17 +1070,17 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1112,17 +1112,17 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_and_b32 s3, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_and_b32 s1, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1173,18 +1173,18 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX8-NOHSA: ; %bb.0: ; %entry
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s2
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1216,18 +1216,18 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_sext_i32_i8 s3, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_sext_i32_i8 s1, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b96 v3, v[0:2], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1280,19 +1280,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1322,19 +1322,19 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_lshr_b32 s3, s2, 24
-; GFX12-NEXT: s_and_b32 s4, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_lshr_b32 s1, s0, 24
+; GFX12-NEXT: s_and_b32 s2, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1386,20 +1386,20 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1431,19 +1431,19 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_ashr_i32 s3, s2, 24
-; GFX12-NEXT: s_sext_i32_i8 s4, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_ashr_i32 s1, s0, 24
+; GFX12-NEXT: s_sext_i32_i8 s2, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1518,30 +1518,30 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s3, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s6, s1, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1583,26 +1583,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: s_lshr_b32 s5, s2, 24
-; GFX12-NEXT: s_and_b32 s7, s2, 0xff
-; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX12-NEXT: s_lshr_b32 s4, s3, 24
-; GFX12-NEXT: s_and_b32 s6, s3, 0xff
-; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_lshr_b32 s3, s0, 24
+; GFX12-NEXT: s_and_b32 s7, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: s_lshr_b32 s2, s1, 24
+; GFX12-NEXT: s_and_b32 s6, s1, 0xff
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_mov_b32_e32 v6, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1677,32 +1677,32 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s0, s0
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s6
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -1747,28 +1747,28 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: s_ashr_i32 s6, s2, 24
-; GFX12-NEXT: s_sext_i32_i8 s7, s2
-; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80010
-; GFX12-NEXT: s_ashr_i32 s4, s3, 24
-; GFX12-NEXT: s_bfe_i32 s5, s3, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s3, s3
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_ashr_i32 s6, s0, 24
+; GFX12-NEXT: s_sext_i32_i8 s7, s0
+; GFX12-NEXT: s_bfe_i32 s0, s0, 0x80010
+; GFX12-NEXT: s_ashr_i32 s2, s1, 24
+; GFX12-NEXT: s_bfe_i32 s3, s1, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s1, s1
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s6
-; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s4
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v7, s2
+; GFX12-NEXT: v_mov_b32_e32 v2, s0
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v4, s3
-; GFX12-NEXT: v_mov_b32_e32 v6, s5
+; GFX12-NEXT: v_mov_b32_e32 v4, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s3
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1889,55 +1889,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s4, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5
-; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s13, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s6
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s7, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7
-; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s3, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s12, s1, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s14, s2, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s2
+; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s10
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s14
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -1999,40 +1999,40 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT: s_lshr_b32 s8, s6, 24
-; GFX12-NEXT: s_lshr_b32 s9, s7, 24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_and_b32 s12, s6, 0xff
-; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX12-NEXT: s_and_b32 s13, s7, 0xff
-; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
-; GFX12-NEXT: s_and_b32 s11, s5, 0xff
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s1
+; GFX12-NEXT: s_lshr_b32 s8, s2, 24
+; GFX12-NEXT: s_lshr_b32 s9, s3, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s0
+; GFX12-NEXT: s_and_b32 s12, s2, 0xff
+; GFX12-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX12-NEXT: s_and_b32 s13, s3, 0xff
+; GFX12-NEXT: s_bfe_u32 s3, s3, 0x80010
+; GFX12-NEXT: s_and_b32 s11, s1, 0xff
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s9
-; GFX12-NEXT: s_lshr_b32 s3, s5, 24
-; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
+; GFX12-NEXT: s_lshr_b32 s7, s1, 24
+; GFX12-NEXT: s_bfe_u32 s1, s1, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s8
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s3
-; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_and_b32 v1, 0xffff, v1
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_and_b32 s10, s4, 0xff
-; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_and_b32 v1, 0xffff, v1
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_and_b32 s10, s0, 0xff
+; GFX12-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v15, s6
; GFX12-NEXT: v_dual_mov_b32 v8, s11 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_and_b32 v9, 0xffff, v9
+; GFX12-NEXT: v_dual_mov_b32 v10, s1 :: v_dual_and_b32 v9, 0xffff, v9
; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_and_b32 v13, 0xffff, v13
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: v_mov_b32_e32 v14, s0
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2153,59 +2153,59 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_ashr_i32 s8, s4, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s9, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s10, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s11, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s6, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s7
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s7, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s6, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i32 s9, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s11, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s12, s2, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s13, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s3, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s3
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s3, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_bfe_i32 v7, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s6, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: s_sext_i32_i8 s2, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v5, v2, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s5
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GFX8-NOHSA-NEXT: v_bfe_i32 v3, v3, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s4
-; GFX8-NOHSA-NEXT: s_sext_i32_i8 s4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -2275,44 +2275,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s5
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_ashr_i32 s12, s7, 24
-; GFX12-NEXT: s_sext_i32_i8 s13, s7
-; GFX12-NEXT: s_bfe_i32 s7, s7, 0x80010
-; GFX12-NEXT: s_ashr_i32 s10, s6, 24
-; GFX12-NEXT: s_bfe_i32 s11, s6, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s6, s6
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s0
+; GFX12-NEXT: s_ashr_i32 s12, s3, 24
+; GFX12-NEXT: s_sext_i32_i8 s13, s3
+; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80010
+; GFX12-NEXT: s_ashr_i32 s10, s2, 24
+; GFX12-NEXT: s_bfe_i32 s11, s2, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s2, s2
; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s12
-; GFX12-NEXT: s_ashr_i32 s8, s5, 24
-; GFX12-NEXT: s_bfe_i32 s9, s5, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s5, s5
+; GFX12-NEXT: s_ashr_i32 s8, s1, 24
+; GFX12-NEXT: s_bfe_i32 s9, s1, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s1, s1
; GFX12-NEXT: v_dual_mov_b32 v0, s13 :: v_dual_mov_b32 v7, s10
-; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v11, s8
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v11, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX12-NEXT: s_ashr_i32 s2, s4, 24
-; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s4, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: s_ashr_i32 s6, s0, 24
+; GFX12-NEXT: s_bfe_i32 s7, s0, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s0, s0
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v15, s6
; GFX12-NEXT: v_mov_b32_e32 v6, s11
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v8, s5
+; GFX12-NEXT: v_mov_b32_e32 v8, s1
; GFX12-NEXT: v_mov_b32_e32 v10, s9
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: v_mov_b32_e32 v12, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s7
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2724,71 +2724,71 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v32i8_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT: s_lshr_b32 s15, s9, 24
-; GFX12-NEXT: s_lshr_b32 s17, s11, 24
-; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT: s_and_b32 s23, s9, 0xff
-; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
-; GFX12-NEXT: s_and_b32 s25, s11, 0xff
-; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
-; GFX12-NEXT: s_lshr_b32 s14, s8, 24
-; GFX12-NEXT: s_lshr_b32 s16, s10, 24
-; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT: s_and_b32 s22, s8, 0xff
-; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
-; GFX12-NEXT: s_and_b32 s24, s10, 0xff
-; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s15
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s13
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s12
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s11
+; GFX12-NEXT: s_lshr_b32 s7, s13, 24
+; GFX12-NEXT: s_lshr_b32 s17, s15, 24
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s10
+; GFX12-NEXT: s_and_b32 s23, s13, 0xff
+; GFX12-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX12-NEXT: s_and_b32 s25, s15, 0xff
+; GFX12-NEXT: s_bfe_u32 s15, s15, 0x80010
+; GFX12-NEXT: s_lshr_b32 s6, s12, 24
+; GFX12-NEXT: s_lshr_b32 s16, s14, 24
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s9
+; GFX12-NEXT: s_and_b32 s22, s12, 0xff
+; GFX12-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX12-NEXT: s_and_b32 s24, s14, 0xff
+; GFX12-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s17
-; GFX12-NEXT: s_lshr_b32 s13, s7, 24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_and_b32 s21, s7, 0xff
-; GFX12-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX12-NEXT: s_lshr_b32 s3, s11, 24
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s8
+; GFX12-NEXT: s_and_b32 s21, s11, 0xff
+; GFX12-NEXT: s_bfe_u32 s11, s11, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_and_b32 v13, 0xffff, v13
+; GFX12-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_and_b32 v13, 0xffff, v13
; GFX12-NEXT: v_dual_mov_b32 v8, s23 :: v_dual_and_b32 v1, 0xffff, v1
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX12-NEXT: v_dual_mov_b32 v28, s22 :: v_dual_and_b32 v25, 0xffff, v11
-; GFX12-NEXT: v_dual_mov_b32 v30, s8 :: v_dual_and_b32 v29, 0xffff, v10
+; GFX12-NEXT: v_dual_mov_b32 v30, s12 :: v_dual_and_b32 v29, 0xffff, v10
; GFX12-NEXT: v_dual_mov_b32 v24, s21 :: v_dual_and_b32 v9, 0xffff, v9
-; GFX12-NEXT: v_dual_mov_b32 v10, s9 :: v_dual_mov_b32 v11, s15
-; GFX12-NEXT: v_mov_b32_e32 v26, s7
-; GFX12-NEXT: s_lshr_b32 s12, s6, 24
-; GFX12-NEXT: s_and_b32 s20, s6, 0xff
-; GFX12-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v11, s7
+; GFX12-NEXT: v_mov_b32_e32 v26, s11
+; GFX12-NEXT: s_lshr_b32 s2, s10, 24
+; GFX12-NEXT: s_and_b32 s20, s10, 0xff
+; GFX12-NEXT: s_bfe_u32 s10, s10, 0x80010
; GFX12-NEXT: v_dual_mov_b32 v4, s24 :: v_dual_and_b32 v17, 0xffff, v14
-; GFX12-NEXT: v_dual_mov_b32 v6, s10 :: v_dual_and_b32 v21, 0xffff, v12
-; GFX12-NEXT: v_dual_mov_b32 v31, s14 :: v_dual_mov_b32 v20, s20
-; GFX12-NEXT: s_lshr_b32 s3, s5, 24
-; GFX12-NEXT: s_and_b32 s19, s5, 0xff
-; GFX12-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v27, s13 :: v_dual_mov_b32 v22, s6
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_and_b32 s18, s4, 0xff
-; GFX12-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX12-NEXT: v_dual_mov_b32 v23, s12 :: v_dual_mov_b32 v16, s19
-; GFX12-NEXT: v_dual_mov_b32 v18, s5 :: v_dual_mov_b32 v19, s3
+; GFX12-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_and_b32 v21, 0xffff, v12
+; GFX12-NEXT: v_dual_mov_b32 v31, s6 :: v_dual_mov_b32 v20, s20
+; GFX12-NEXT: s_lshr_b32 s1, s9, 24
+; GFX12-NEXT: s_and_b32 s19, s9, 0xff
+; GFX12-NEXT: s_bfe_u32 s9, s9, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v27, s3 :: v_dual_mov_b32 v22, s10
+; GFX12-NEXT: s_lshr_b32 s0, s8, 24
+; GFX12-NEXT: s_and_b32 s18, s8, 0xff
+; GFX12-NEXT: s_bfe_u32 s8, s8, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v23, s2 :: v_dual_mov_b32 v16, s19
+; GFX12-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s1
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s2
-; GFX12-NEXT: v_mov_b32_e32 v14, s4
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v12, s18 :: v_dual_mov_b32 v15, s0
+; GFX12-NEXT: v_mov_b32_e32 v14, s8
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3231,78 +3231,78 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v32i8_to_v32i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX12-NEXT: s_load_b256 s[8:15], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s11
-; GFX12-NEXT: v_lshrrev_b16 v9, 8, s9
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s8
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s10
-; GFX12-NEXT: v_lshrrev_b16 v12, 8, s6
-; GFX12-NEXT: s_ashr_i32 s20, s9, 24
-; GFX12-NEXT: s_bfe_i32 s21, s9, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s9, s9
-; GFX12-NEXT: s_ashr_i32 s24, s11, 24
-; GFX12-NEXT: s_sext_i32_i8 s25, s11
-; GFX12-NEXT: s_bfe_i32 s11, s11, 0x80010
-; GFX12-NEXT: v_lshrrev_b16 v14, 8, s5
-; GFX12-NEXT: s_ashr_i32 s18, s8, 24
-; GFX12-NEXT: s_bfe_i32 s19, s8, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s8, s8
-; GFX12-NEXT: s_ashr_i32 s22, s10, 24
-; GFX12-NEXT: s_bfe_i32 s23, s10, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s15
+; GFX12-NEXT: v_lshrrev_b16 v9, 8, s13
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s12
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s14
+; GFX12-NEXT: v_lshrrev_b16 v12, 8, s10
+; GFX12-NEXT: s_ashr_i32 s20, s13, 24
+; GFX12-NEXT: s_bfe_i32 s21, s13, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s13, s13
+; GFX12-NEXT: s_ashr_i32 s24, s15, 24
+; GFX12-NEXT: s_sext_i32_i8 s25, s15
+; GFX12-NEXT: s_bfe_i32 s15, s15, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v14, 8, s9
+; GFX12-NEXT: s_ashr_i32 s18, s12, 24
+; GFX12-NEXT: s_bfe_i32 s19, s12, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s12, s12
+; GFX12-NEXT: s_ashr_i32 s22, s14, 24
+; GFX12-NEXT: s_bfe_i32 s23, s14, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s14, s14
; GFX12-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v3, s24
-; GFX12-NEXT: v_lshrrev_b16 v13, 8, s4
-; GFX12-NEXT: s_ashr_i32 s12, s5, 24
-; GFX12-NEXT: s_ashr_i32 s14, s6, 24
-; GFX12-NEXT: s_ashr_i32 s16, s7, 24
-; GFX12-NEXT: s_bfe_i32 s17, s7, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s7, s7
+; GFX12-NEXT: v_lshrrev_b16 v13, 8, s8
+; GFX12-NEXT: s_ashr_i32 s0, s8, 24
+; GFX12-NEXT: s_bfe_i32 s1, s8, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s2, s8
+; GFX12-NEXT: s_ashr_i32 s3, s9, 24
+; GFX12-NEXT: s_ashr_i32 s8, s10, 24
+; GFX12-NEXT: s_ashr_i32 s16, s11, 24
+; GFX12-NEXT: s_bfe_i32 s17, s11, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s11, s11
; GFX12-NEXT: v_dual_mov_b32 v0, s25 :: v_dual_mov_b32 v7, s22
-; GFX12-NEXT: v_mov_b32_e32 v2, s11
-; GFX12-NEXT: v_dual_mov_b32 v8, s9 :: v_dual_mov_b32 v23, s14
+; GFX12-NEXT: v_mov_b32_e32 v2, s15
+; GFX12-NEXT: v_dual_mov_b32 v8, s13 :: v_dual_mov_b32 v23, s8
; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX12-NEXT: v_bfe_i32 v25, v11, 0, 8
; GFX12-NEXT: v_bfe_i32 v29, v10, 0, 8
; GFX12-NEXT: v_bfe_i32 v9, v9, 0, 8
-; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s12
+; GFX12-NEXT: v_dual_mov_b32 v10, s21 :: v_dual_mov_b32 v19, s3
; GFX12-NEXT: v_mov_b32_e32 v11, s20
-; GFX12-NEXT: s_ashr_i32 s2, s4, 24
-; GFX12-NEXT: s_bfe_i32 s15, s6, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s6, s6
-; GFX12-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v31, s18
+; GFX12-NEXT: s_bfe_i32 s6, s9, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s7, s9
+; GFX12-NEXT: s_bfe_i32 s9, s10, 0x80010
+; GFX12-NEXT: s_sext_i32_i8 s10, s10
+; GFX12-NEXT: v_dual_mov_b32 v4, s14 :: v_dual_mov_b32 v31, s18
; GFX12-NEXT: v_dual_mov_b32 v6, s23 :: v_dual_mov_b32 v27, s16
; GFX12-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX12-NEXT: v_dual_mov_b32 v28, s8 :: v_dual_mov_b32 v15, s2
+; GFX12-NEXT: v_dual_mov_b32 v28, s12 :: v_dual_mov_b32 v15, s0
; GFX12-NEXT: v_mov_b32_e32 v30, s19
-; GFX12-NEXT: s_bfe_i32 s13, s5, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s5, s5
-; GFX12-NEXT: v_mov_b32_e32 v24, s7
+; GFX12-NEXT: v_mov_b32_e32 v24, s11
; GFX12-NEXT: v_mov_b32_e32 v26, s17
-; GFX12-NEXT: s_bfe_i32 s3, s4, 0x80010
-; GFX12-NEXT: s_sext_i32_i8 s4, s4
; GFX12-NEXT: v_bfe_i32 v21, v12, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v20, s6
-; GFX12-NEXT: v_mov_b32_e32 v22, s15
+; GFX12-NEXT: v_mov_b32_e32 v20, s10
+; GFX12-NEXT: v_mov_b32_e32 v22, s9
; GFX12-NEXT: v_bfe_i32 v17, v14, 0, 8
-; GFX12-NEXT: v_mov_b32_e32 v16, s5
-; GFX12-NEXT: v_mov_b32_e32 v18, s13
+; GFX12-NEXT: v_mov_b32_e32 v16, s7
+; GFX12-NEXT: v_mov_b32_e32 v18, s6
; GFX12-NEXT: v_bfe_i32 v13, v13, 0, 8
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
-; GFX12-NEXT: v_mov_b32_e32 v14, s3
+; GFX12-NEXT: global_store_b128 v32, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v32, v[4:7], s[4:5] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v12, s2
+; GFX12-NEXT: v_mov_b32_e32 v14, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v32, v[12:15], s[0:1]
+; GFX12-NEXT: global_store_b128 v32, v[8:11], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v32, v[28:31], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v32, v[24:27], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v32, v[20:23], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v32, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v32, v[12:15], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5234,14 +5234,14 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -5266,13 +5266,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u8 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5318,13 +5318,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5352,15 +5352,15 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_i8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5405,13 +5405,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v0, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -5436,12 +5436,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_u8 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
+; GFX12-NEXT: global_store_b64 v1, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5487,13 +5487,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
@@ -5521,15 +5521,15 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v0, v2, s[2:3]
+; GFX12-NEXT: global_load_i8 v0, v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5580,14 +5580,14 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_e32 v0, 0xff, v2
@@ -5627,16 +5627,16 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v1, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v1, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v2, 8, v0
; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xff, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5689,13 +5689,13 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v0, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v2, 8, v0
; GFX8-NOHSA-NEXT: v_bfe_i32 v0, v0, 0, 8
@@ -5738,10 +5738,10 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v4, s[2:3]
+; GFX12-NEXT: global_load_u16 v0, v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 8
@@ -5750,7 +5750,7 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5812,28 +5812,28 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s0, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v4i8_to_v4i64:
@@ -5870,23 +5870,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
-; GFX12-NEXT: s_bfe_u32 s3, s2, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s0
+; GFX12-NEXT: s_bfe_u32 s1, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s1
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_and_b32 s2, s2, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -5958,31 +5958,31 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s0, 24
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -6021,26 +6021,26 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
-; GFX12-NEXT: s_lshr_b32 s4, s2, 16
-; GFX12-NEXT: s_lshr_b32 s6, s2, 24
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s0
+; GFX12-NEXT: s_lshr_b32 s2, s0, 16
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s5
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v7, s7
-; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v5, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v7, s7
+; GFX12-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: v_mov_b32_e32 v6, s6
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6132,40 +6132,40 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s3, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s2, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s6, s3, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s2, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s2, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s3, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s4
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s6, s1, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s7, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s1, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
@@ -6227,34 +6227,34 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_u32 s4, s3, 0x80010
+; GFX12-NEXT: s_bfe_u32 s2, s1, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
-; GFX12-NEXT: s_lshr_b32 s5, s3, 24
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
-; GFX12-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
-; GFX12-NEXT: s_bfe_u32 s5, s2, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-NEXT: s_lshr_b32 s3, s1, 24
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s0
+; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
+; GFX12-NEXT: s_bfe_u32 s3, s0, 0x80010
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_and_b32 s2, s2, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s3
+; GFX12-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: s_and_b32 s2, s3, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: s_and_b32 s0, s1, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6368,55 +6368,55 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NOHSA-NEXT: s_mov_b32 s5, 0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_mov_b32 s3, 0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s3, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 16
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s2, 24
-; GFX8-NOHSA-NEXT: s_mov_b32 s4, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s1, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s0, 24
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[0:1], 56
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v0, 0, 8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: s_add_u32 s0, s0, 32
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s13
-; GFX8-NOHSA-NEXT: s_addc_u32 s1, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
@@ -6479,40 +6479,40 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
-; GFX12-NEXT: v_lshrrev_b16 v7, 8, s3
-; GFX12-NEXT: s_lshr_b32 s6, s3, 16
-; GFX12-NEXT: s_lshr_b32 s8, s2, 16
-; GFX12-NEXT: s_lshr_b32 s10, s2, 24
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: s_lshr_b32 s8, s0, 16
+; GFX12-NEXT: s_lshr_b32 s10, s0, 24
; GFX12-NEXT: v_bfe_i32 v6, v6, 0, 8
-; GFX12-NEXT: s_mov_b32 s4, s3
+; GFX12-NEXT: s_mov_b32 s2, s1
; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GFX12-NEXT: v_bfe_i32 v14, v7, 0, 8
-; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[2:3], s[2:3], 56
+; GFX12-NEXT: s_bfe_i64 s[12:13], s[0:1], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[0:1], s[0:1], 56
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX12-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v9, s9
; GFX12-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v11, s11
-; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s5
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s13
+; GFX12-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v13, s3
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v5, s13
; GFX12-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v1, s7
; GFX12-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX12-NEXT: v_mov_b32_e32 v12, s4
+; GFX12-NEXT: v_mov_b32_e32 v12, s2
; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX12-NEXT: s_clause 0x3
-; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v16, v[4:7], s[0:1]
-; GFX12-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1] offset:32
+; GFX12-NEXT: global_store_b128 v16, v[8:11], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v16, v[4:7], s[4:5]
+; GFX12-NEXT: global_store_b128 v16, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v16, v[12:15], s[4:5] offset:32
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -6664,82 +6664,82 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s4, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s4
-; GFX8-NOHSA-NEXT: s_and_b32 s12, s5, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s5
-; GFX8-NOHSA-NEXT: s_and_b32 s13, s7, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s14, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s6
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s5, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x70
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 48
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s1, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s3, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s2, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s0, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s0, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s11, s1, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s12, s3, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s13, s2, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s2
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s0, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s2, s2, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s15, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s0, s3, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x70
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 64
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 0x60
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 32
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: constant_zextload_v16i8_to_v16i64:
@@ -6833,55 +6833,55 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_u32 s2, s7, 0x80010
+; GFX12-NEXT: s_bfe_u32 s6, s3, 0x80010
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: s_lshr_b32 s3, s7, 24
-; GFX12-NEXT: s_lshr_b32 s2, s5, 24
-; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: s_bfe_u32 s3, s5, 0x80010
-; GFX12-NEXT: v_lshrrev_b16 v4, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s7
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_lshr_b32 s2, s6, 24
-; GFX12-NEXT: s_bfe_u32 s3, s6, 0x80010
+; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-NEXT: s_lshr_b32 s7, s3, 24
+; GFX12-NEXT: s_lshr_b32 s6, s1, 24
+; GFX12-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: s_bfe_u32 s7, s1, 0x80010
+; GFX12-NEXT: v_lshrrev_b16 v4, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s3
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_lshr_b32 s2, s4, 24
-; GFX12-NEXT: s_bfe_u32 s3, s4, 0x80010
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
-; GFX12-NEXT: v_mov_b32_e32 v0, s3
-; GFX12-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-NEXT: s_and_b32 s2, s6, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX12-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:48
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: s_lshr_b32 s6, s0, 24
+; GFX12-NEXT: s_bfe_u32 s7, s0, 0x80010
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:80
+; GFX12-NEXT: v_mov_b32_e32 v0, s7
+; GFX12-NEXT: v_mov_b32_e32 v2, s6
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:16
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s5
-; GFX12-NEXT: s_and_b32 s2, s7, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s1
+; GFX12-NEXT: s_and_b32 s2, s3, 0xff
+; GFX12-NEXT: s_and_b32 s1, s1, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:64
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4
-; GFX12-NEXT: s_and_b32 s2, s5, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: v_lshrrev_b16 v5, 8, s0
+; GFX12-NEXT: s_and_b32 s0, s0, 0xff
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:96
+; GFX12-NEXT: v_mov_b32_e32 v0, s1
; GFX12-NEXT: v_mov_b32_e32 v2, v4
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v5
-; GFX12-NEXT: s_and_b32 s2, s4, 0xff
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5] offset:32
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v1, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7081,9 +7081,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s11, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s10, 16
@@ -7092,16 +7092,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 16
; GFX8-NOHSA-NEXT: s_lshr_b32 s22, s8, 24
; GFX8-NOHSA-NEXT: s_mov_b32 s24, s11
-; GFX8-NOHSA-NEXT: s_mov_b32 s4, s9
+; GFX8-NOHSA-NEXT: s_mov_b32 s2, s9
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s11
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v1, 8, s10
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s9
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s8
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[8:9], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[6:7], s[10:11], 0x80000
; GFX8-NOHSA-NEXT: s_ashr_i64 s[8:9], s[8:9], 56
; GFX8-NOHSA-NEXT: s_ashr_i64 s[10:11], s[10:11], 56
-; GFX8-NOHSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
@@ -7110,18 +7110,18 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GFX8-NOHSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s10
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x70
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s4, 0x70
; GFX8-NOHSA-NEXT: v_bfe_i32 v10, v1, 0, 8
; GFX8-NOHSA-NEXT: v_bfe_i32 v14, v0, 0, 8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s11
-; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s13
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s11
-; GFX8-NOHSA-NEXT: s_add_u32 s10, s0, 0x50
+; GFX8-NOHSA-NEXT: s_add_u32 s10, s4, 0x50
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s11, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s11, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s15
@@ -7131,53 +7131,53 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s8
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 48
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 48
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s9
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s18
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s19
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v16, s20
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v17, s21
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v18, s22
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v19, s23
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
-; GFX8-NOHSA-NEXT: s_add_u32 s8, s0, 0x60
+; GFX8-NOHSA-NEXT: s_add_u32 s8, s4, 0x60
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NOHSA-NEXT: s_addc_u32 s9, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s9, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v12, s24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v13, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s6
-; GFX8-NOHSA-NEXT: s_add_u32 s6, s0, 64
+; GFX8-NOHSA-NEXT: s_add_u32 s6, s4, 64
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s7
-; GFX8-NOHSA-NEXT: s_addc_u32 s7, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s7, s5, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NOHSA-NEXT: v_bfe_i32 v6, v4, 0, 8
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 32
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NOHSA-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GFX8-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -7282,44 +7282,44 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[8:11], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_lshrrev_b16 v10, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v11, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v21, 8, s5
-; GFX12-NEXT: v_lshrrev_b16 v23, 8, s4
-; GFX12-NEXT: s_lshr_b32 s8, s7, 16
-; GFX12-NEXT: s_lshr_b32 s10, s6, 16
-; GFX12-NEXT: s_lshr_b32 s12, s6, 24
+; GFX12-NEXT: v_lshrrev_b16 v10, 8, s11
+; GFX12-NEXT: v_lshrrev_b16 v11, 8, s10
+; GFX12-NEXT: v_lshrrev_b16 v21, 8, s9
+; GFX12-NEXT: v_lshrrev_b16 v23, 8, s8
+; GFX12-NEXT: s_lshr_b32 s2, s11, 16
+; GFX12-NEXT: s_lshr_b32 s6, s10, 16
+; GFX12-NEXT: s_lshr_b32 s12, s10, 24
; GFX12-NEXT: v_bfe_i32 v22, v10, 0, 8
; GFX12-NEXT: v_bfe_i32 v10, v11, 0, 8
-; GFX12-NEXT: s_lshr_b32 s18, s4, 24
-; GFX12-NEXT: s_mov_b32 s20, s7
-; GFX12-NEXT: s_lshr_b32 s14, s5, 16
-; GFX12-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[6:7], s[6:7], 56
+; GFX12-NEXT: s_lshr_b32 s18, s8, 24
+; GFX12-NEXT: s_mov_b32 s20, s11
+; GFX12-NEXT: s_lshr_b32 s14, s9, 16
+; GFX12-NEXT: s_bfe_i64 s[24:25], s[10:11], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[10:11], s[10:11], 56
; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
-; GFX12-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
+; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GFX12-NEXT: v_bfe_i32 v28, v21, 0, 8
-; GFX12-NEXT: s_lshr_b32 s16, s4, 16
-; GFX12-NEXT: s_mov_b32 s22, s5
-; GFX12-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
-; GFX12-NEXT: s_ashr_i64 s[4:5], s[4:5], 56
+; GFX12-NEXT: s_lshr_b32 s16, s8, 16
+; GFX12-NEXT: s_mov_b32 s22, s9
+; GFX12-NEXT: s_bfe_i64 s[0:1], s[8:9], 0x80000
+; GFX12-NEXT: s_ashr_i64 s[8:9], s[8:9], 56
; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: v_dual_mov_b32 v30, 0 :: v_dual_mov_b32 v3, s11
; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v7, s5
-; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s9
-; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v13, s11
-; GFX12-NEXT: v_dual_mov_b32 v12, s10 :: v_dual_mov_b32 v15, s13
+; GFX12-NEXT: v_dual_mov_b32 v2, s10 :: v_dual_mov_b32 v7, s9
+; GFX12-NEXT: v_dual_mov_b32 v8, s24 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v13, s7
+; GFX12-NEXT: v_dual_mov_b32 v12, s6 :: v_dual_mov_b32 v15, s13
; GFX12-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v5, s15
; GFX12-NEXT: v_bfe_i32 v24, v23, 0, 8
; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
-; GFX12-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v9, s25
+; GFX12-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v9, s25
; GFX12-NEXT: v_dual_mov_b32 v18, s18 :: v_dual_mov_b32 v21, s21
; GFX12-NEXT: v_dual_mov_b32 v20, s20 :: v_dual_mov_b32 v27, s23
; GFX12-NEXT: v_ashrrev_i32_e32 v23, 31, v22
@@ -7332,16 +7332,16 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_dual_mov_b32 v16, s16 :: v_dual_mov_b32 v19, s19
; GFX12-NEXT: v_ashrrev_i32_e32 v25, 31, v24
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v30, v[0:3], s[0:1] offset:112
-; GFX12-NEXT: global_store_b128 v30, v[20:23], s[0:1] offset:96
-; GFX12-NEXT: v_dual_mov_b32 v22, s2 :: v_dual_mov_b32 v23, s3
+; GFX12-NEXT: global_store_b128 v30, v[0:3], s[4:5] offset:112
+; GFX12-NEXT: global_store_b128 v30, v[20:23], s[4:5] offset:96
+; GFX12-NEXT: v_dual_mov_b32 v22, s0 :: v_dual_mov_b32 v23, s1
; GFX12-NEXT: s_clause 0x5
-; GFX12-NEXT: global_store_b128 v30, v[12:15], s[0:1] offset:80
-; GFX12-NEXT: global_store_b128 v30, v[8:11], s[0:1] offset:64
-; GFX12-NEXT: global_store_b128 v30, v[4:7], s[0:1] offset:48
-; GFX12-NEXT: global_store_b128 v30, v[26:29], s[0:1] offset:32
-; GFX12-NEXT: global_store_b128 v30, v[16:19], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v30, v[22:25], s[0:1]
+; GFX12-NEXT: global_store_b128 v30, v[12:15], s[4:5] offset:80
+; GFX12-NEXT: global_store_b128 v30, v[8:11], s[4:5] offset:64
+; GFX12-NEXT: global_store_b128 v30, v[4:7], s[4:5] offset:48
+; GFX12-NEXT: global_store_b128 v30, v[26:29], s[4:5] offset:32
+; GFX12-NEXT: global_store_b128 v30, v[16:19], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v30, v[22:25], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -7613,159 +7613,159 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v32i8_to_v32i64:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, v1
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx8 s[8:15], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s5, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s7, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s14, s9, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s15, s11, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s10, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s8, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s6, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s4, 24
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s4, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s4
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s5, 0xff
-; GFX8-NOHSA-NEXT: s_and_b32 s20, s6, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s6
-; GFX8-NOHSA-NEXT: s_and_b32 s21, s7, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s7
-; GFX8-NOHSA-NEXT: s_and_b32 s22, s8, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s8
-; GFX8-NOHSA-NEXT: s_and_b32 s23, s9, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s9
-; GFX8-NOHSA-NEXT: s_and_b32 s24, s10, 0xff
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s10
-; GFX8-NOHSA-NEXT: s_and_b32 s25, s11, 0xff
-; GFX8-NOHSA-NEXT: s_bfe_u32 s26, s4, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s6, s6, 0x80010
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s9, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s11, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s16, s13, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s15, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s17, s14, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s18, s12, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s19, s10, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s20, s8, 24
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s8, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v4, 8, s8
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s9, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s9
+; GFX8-NOHSA-NEXT: s_and_b32 s21, s10, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v6, 8, s10
+; GFX8-NOHSA-NEXT: s_and_b32 s22, s11, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v7, 8, s11
+; GFX8-NOHSA-NEXT: s_and_b32 s23, s12, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v8, 8, s12
+; GFX8-NOHSA-NEXT: s_and_b32 s24, s13, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v9, 8, s13
+; GFX8-NOHSA-NEXT: s_and_b32 s25, s14, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v12, 8, s14
+; GFX8-NOHSA-NEXT: s_and_b32 s26, s15, 0xff
; GFX8-NOHSA-NEXT: s_bfe_u32 s8, s8, 0x80010
; GFX8-NOHSA-NEXT: s_bfe_u32 s10, s10, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s27, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s7, s7, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s12, s12, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s14, s14, 0x80010
; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s9, 0x80010
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s11, 0x80010
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xf0
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v5, 8, s5
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xb0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
-; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x70
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s14
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_bfe_u32 s11, s11, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s13, s13, 0x80010
+; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s15, 0x80010
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xf0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xb0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 48
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s13
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x70
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xd0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s27
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 48
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x90
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s16
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xd0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s9
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x50
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x90
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s14
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s17
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x50
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s12
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xe0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s19
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s11
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xc0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xe0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s20
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
+; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v13, 8, s15
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xc0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s26
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0xa0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0xa0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s25
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v12
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x80
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x80
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s24
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 0x60
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 0x60
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s23
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v7
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 64
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 64
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s22
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[7:8], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v6
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s4
-; GFX8-NOHSA-NEXT: s_add_u32 s4, s0, 32
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
-; GFX8-NOHSA-NEXT: s_addc_u32 s5, s1, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: s_add_u32 s2, s4, 32
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
+; GFX8-NOHSA-NEXT: s_addc_u32 s3, s5, 0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GFX8-NOHSA-NEXT: s_nop 0
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -8970,13 +8970,13 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_zextload_i8_to_i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9008,12 +9008,12 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_zextload_i8_to_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9056,13 +9056,13 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX8-NOHSA-LABEL: constant_sextload_i8_to_i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9096,12 +9096,12 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
;
; GFX12-LABEL: constant_sextload_i8_to_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_i8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9144,13 +9144,13 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9182,12 +9182,12 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9230,13 +9230,13 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_sbyte v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: flat_store_short v[0:1], v2
; GFX8-NOHSA-NEXT: s_endpgm
@@ -9270,12 +9270,12 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_i8 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_i8 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9324,14 +9324,14 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_lshrrev_b16_e32 v4, 8, v2
; GFX8-NOHSA-NEXT: v_and_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
@@ -9364,17 +9364,17 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_and_b32_e32 v2, 0xff, v1
; GFX12-NEXT: v_lshrrev_b16 v1, 8, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9427,14 +9427,14 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NOHSA-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GFX8-NOHSA-NEXT: v_and_b32_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NOHSA-NEXT: v_ashrrev_i16_e32 v2, 8, v2
@@ -9477,17 +9477,17 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_bfe_i32 v2, v1, 0, 8
; GFX12-NEXT: v_ashrrev_i16 v1, 8, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9540,20 +9540,20 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s0, v3, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 24
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s1, v3, 16
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -9606,23 +9606,23 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v3, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s2, 16
-; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s3
-; GFX12-NEXT: v_lshrrev_b16 v2, 8, s2
-; GFX12-NEXT: s_lshr_b32 s2, s2, 24
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v1, 0xff, s1
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s0
+; GFX12-NEXT: s_lshr_b32 s0, s0, 24
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_lshl_or_b32 v0, v2, 16, v0
-; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v1
-; GFX12-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX12-NEXT: v_lshl_or_b32 v1, s0, 16, v1
+; GFX12-NEXT: global_store_b64 v3, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9681,22 +9681,22 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dword s2, s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NOHSA-NEXT: s_load_dword s0, s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 16
-; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s0, 0x80000
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v2, 8, s2
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s0, 16
+; GFX8-NOHSA-NEXT: s_ashr_i32 s2, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s1, 0x80000
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s0, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v2, 8, s0
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s3
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -9757,22 +9757,22 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s0, s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s4, s2, 0x80000
-; GFX12-NEXT: s_lshr_b32 s3, s2, 16
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s4
-; GFX12-NEXT: s_ashr_i32 s2, s2, 24
-; GFX12-NEXT: s_bfe_i32 s3, s3, 0x80000
+; GFX12-NEXT: s_bfe_i32 s2, s0, 0x80000
+; GFX12-NEXT: s_lshr_b32 s1, s0, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s0
+; GFX12-NEXT: v_and_b32_e64 v1, 0xffff, s2
+; GFX12-NEXT: s_ashr_i32 s0, s0, 24
+; GFX12-NEXT: s_bfe_i32 s1, s1, 0x80000
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s1, s0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v1, s0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -9843,29 +9843,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
-; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s2
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s4, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s0, 24
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v3, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s1, 24
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s1
+; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v3
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s4, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s1, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v2
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s0, v2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -9948,30 +9948,30 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s5, s2, 16
-; GFX12-NEXT: s_lshr_b32 s6, s3, 16
-; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s2
-; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s3
+; GFX12-NEXT: s_lshr_b32 s3, s0, 16
+; GFX12-NEXT: s_lshr_b32 s6, s1, 16
+; GFX12-NEXT: v_and_b32_e64 v0, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v2, 0xff, s1
; GFX12-NEXT: v_and_b32_e64 v3, 0xff, s6
-; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s5
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s3
; GFX12-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s3
-; GFX12-NEXT: v_lshrrev_b16 v6, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s1
+; GFX12-NEXT: v_lshrrev_b16 v6, 8, s0
; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: s_lshr_b32 s4, s2, 24
-; GFX12-NEXT: s_lshr_b32 s2, s3, 24
+; GFX12-NEXT: s_lshr_b32 s2, s0, 24
+; GFX12-NEXT: s_lshr_b32 s0, s1, 24
; GFX12-NEXT: v_lshl_or_b32 v0, v6, 16, v0
; GFX12-NEXT: v_lshl_or_b32 v2, v1, 16, v2
-; GFX12-NEXT: v_lshl_or_b32 v3, s2, 16, v3
-; GFX12-NEXT: v_lshl_or_b32 v1, s4, 16, v5
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_lshl_or_b32 v3, s0, 16, v3
+; GFX12-NEXT: v_lshl_or_b32 v1, s2, 16, v5
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10054,36 +10054,36 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX8-NOHSA-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NOHSA-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s2
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
-; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s3, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s6
+; GFX8-NOHSA-NEXT: s_bfe_i32 s6, s0, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[0:1], 56
+; GFX8-NOHSA-NEXT: s_lshr_b32 s5, s1, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s6
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s5, 0x80000
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s5, 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s0, 16
+; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2
+; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s1, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s1
+; GFX8-NOHSA-NEXT: s_ashr_i32 s0, s0, 24
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s4, 0x80000
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
-; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
-; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_lshr_b32 s4, s2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s1, v1
-; GFX8-NOHSA-NEXT: s_ashr_i32 s1, s2, 24
-; GFX8-NOHSA-NEXT: s_bfe_i32 s2, s4, 0x80000
-; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, 0xffff, s2
-; GFX8-NOHSA-NEXT: s_or_b32 s1, s2, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s3, v1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10179,29 +10179,29 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
;
; GFX12-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX12-NEXT: s_load_b64 s[0:1], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_bfe_i32 s8, s2, 0x80000
-; GFX12-NEXT: s_bfe_i32 s9, s3, 0x80000
-; GFX12-NEXT: s_lshr_b32 s6, s2, 16
-; GFX12-NEXT: s_lshr_b32 s7, s3, 16
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s2
-; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3
-; GFX12-NEXT: s_ashr_i64 s[4:5], s[2:3], 56
+; GFX12-NEXT: s_bfe_i32 s8, s0, 0x80000
+; GFX12-NEXT: s_bfe_i32 s9, s1, 0x80000
+; GFX12-NEXT: s_lshr_b32 s6, s0, 16
+; GFX12-NEXT: s_lshr_b32 s7, s1, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s0
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s1
+; GFX12-NEXT: s_ashr_i64 s[2:3], s[0:1], 56
; GFX12-NEXT: v_and_b32_e64 v3, 0xffff, s8
; GFX12-NEXT: v_and_b32_e64 v5, 0xffff, s9
-; GFX12-NEXT: s_ashr_i32 s2, s2, 24
-; GFX12-NEXT: s_bfe_i32 s3, s6, 0x80000
-; GFX12-NEXT: s_bfe_i32 s5, s7, 0x80000
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GFX12-NEXT: s_pack_ll_b32_b16 s3, s5, s4
-; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT: s_ashr_i32 s0, s0, 24
+; GFX12-NEXT: s_bfe_i32 s1, s6, 0x80000
+; GFX12-NEXT: s_bfe_i32 s3, s7, 0x80000
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GFX12-NEXT: s_pack_ll_b32_b16 s1, s3, s2
+; GFX12-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s0
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v5
-; GFX12-NEXT: v_mov_b32_e32 v3, s3
-; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX12-NEXT: v_mov_b32_e32 v3, s1
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10314,53 +10314,53 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s3, v1, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s0, 24
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s1, 24
+; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s1, 0x80010
+; GFX8-NOHSA-NEXT: s_and_b32 s10, s1, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s3, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
-; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
-; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s7, 0x80010
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
-; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
-; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
-; GFX8-NOHSA-NEXT: s_or_b32 s3, s4, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s4, s7, 0xff
-; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
-; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s6
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
-; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s6
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s0, v0
+; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
-; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
-; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s7
-; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s2, v3, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
+; GFX8-NOHSA-NEXT: s_or_b32 s9, s10, s1
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: s_and_b32 s1, s3, 0xff
+; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8
+; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
+; GFX8-NOHSA-NEXT: v_lshrrev_b16_e64 v2, 8, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 24
+; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s1
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v3, s6, v3, 16
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s2, v2
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
+; GFX8-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s9
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10509,27 +10509,27 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s3, s6, 16
-; GFX12-NEXT: s_lshr_b32 s9, s7, 16
-; GFX12-NEXT: s_lshr_b32 s11, s4, 16
-; GFX12-NEXT: s_lshr_b32 s13, s5, 16
-; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s5
-; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s4
-; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s7
-; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s6
+; GFX12-NEXT: s_lshr_b32 s7, s2, 16
+; GFX12-NEXT: s_lshr_b32 s9, s3, 16
+; GFX12-NEXT: s_lshr_b32 s11, s0, 16
+; GFX12-NEXT: s_lshr_b32 s13, s1, 16
+; GFX12-NEXT: v_and_b32_e64 v4, 0xff, s1
+; GFX12-NEXT: v_and_b32_e64 v5, 0xff, s0
+; GFX12-NEXT: v_and_b32_e64 v6, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v7, 0xff, s2
; GFX12-NEXT: v_and_b32_e64 v11, 0xff, s9
-; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s3
+; GFX12-NEXT: v_and_b32_e64 v12, 0xff, s7
; GFX12-NEXT: v_and_b32_e64 v9, 0xff, s13
; GFX12-NEXT: v_and_b32_e64 v10, 0xff, s11
; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_and_b32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshrrev_b16 v1, 8, s6
-; GFX12-NEXT: v_lshrrev_b16 v3, 8, s7
-; GFX12-NEXT: v_lshrrev_b16 v0, 8, s4
-; GFX12-NEXT: v_lshrrev_b16 v2, 8, s5
+; GFX12-NEXT: v_lshrrev_b16 v1, 8, s2
+; GFX12-NEXT: v_lshrrev_b16 v3, 8, s3
+; GFX12-NEXT: v_lshrrev_b16 v0, 8, s0
+; GFX12-NEXT: v_lshrrev_b16 v2, 8, s1
; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX12-NEXT: v_and_b32_e32 v7, 0xffff, v7
@@ -10537,21 +10537,21 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_and_b32_e32 v12, 0xffff, v12
; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v9
; GFX12-NEXT: v_and_b32_e32 v10, 0xffff, v10
-; GFX12-NEXT: s_lshr_b32 s2, s6, 24
-; GFX12-NEXT: s_lshr_b32 s8, s7, 24
-; GFX12-NEXT: s_lshr_b32 s10, s4, 24
-; GFX12-NEXT: s_lshr_b32 s12, s5, 24
+; GFX12-NEXT: s_lshr_b32 s6, s2, 24
+; GFX12-NEXT: s_lshr_b32 s8, s3, 24
+; GFX12-NEXT: s_lshr_b32 s10, s0, 24
+; GFX12-NEXT: s_lshr_b32 s12, s1, 24
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v0, v0, 16, v5
; GFX12-NEXT: v_lshl_or_b32 v6, v3, 16, v6
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
; GFX12-NEXT: v_lshl_or_b32 v7, s8, 16, v11
-; GFX12-NEXT: v_lshl_or_b32 v5, s2, 16, v12
+; GFX12-NEXT: v_lshl_or_b32 v5, s6, 16, v12
; GFX12-NEXT: v_lshl_or_b32 v3, s12, 16, v9
; GFX12-NEXT: v_lshl_or_b32 v1, s10, 16, v10
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -10689,62 +10689,63 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX8-NOHSA-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX8-NOHSA: ; %bb.0:
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s5, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s5
-; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 16
+; GFX8-NOHSA-NEXT: s_bfe_i32 s10, s1, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s1
+; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s1, 16
; GFX8-NOHSA-NEXT: s_and_b32 s10, 0xffff, s10
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 16
-; GFX8-NOHSA-NEXT: s_bfe_i32 s5, s4, 0x80000
+; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s0, 16
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s0, 0x80000
; GFX8-NOHSA-NEXT: v_or_b32_e32 v2, s10, v0
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s4, s3, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s3
-; GFX8-NOHSA-NEXT: s_and_b32 s4, 0xffff, s4
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v0, 8, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s9, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s9
+; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s4, v1
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s2, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s2
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v3, s0, v1
+; GFX8-NOHSA-NEXT: s_bfe_i32 s0, s8, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v1, 8, s8
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NOHSA-NEXT: s_lshr_b32 s9, s7, 16
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s3, v1
-; GFX8-NOHSA-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s9, 0x80000
-; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s7, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s7
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: s_lshr_b32 s7, s3, 16
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s1, v0
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v1, s0, v1
+; GFX8-NOHSA-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s7, 0x80000
+; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s3, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s3
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s3, v4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s6, 0x80000
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s6
-; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 16
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v6, s1, v4
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s2, 0x80000
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v4, 8, s2
+; GFX8-NOHSA-NEXT: s_lshr_b32 s6, s2, 16
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s3, v4
-; GFX8-NOHSA-NEXT: s_bfe_i32 s3, s8, 0x80000
-; GFX8-NOHSA-NEXT: s_and_b32 s5, 0xffff, s5
-; GFX8-NOHSA-NEXT: s_and_b32 s3, 0xffff, s3
-; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s8
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v4, s1, v4
+; GFX8-NOHSA-NEXT: s_bfe_i32 s1, s6, 0x80000
+; GFX8-NOHSA-NEXT: s_and_b32 s1, 0xffff, s1
+; GFX8-NOHSA-NEXT: v_ashrrev_i16_e64 v5, 8, s6
; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s2
-; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s3, v5
-; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s3
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s2
-; GFX8-NOHSA-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s0
+; GFX8-NOHSA-NEXT: s_add_u32 s0, s4, 16
+; GFX8-NOHSA-NEXT: v_or_b32_e32 v5, s1, v5
+; GFX8-NOHSA-NEXT: s_addc_u32 s1, s5, 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
-; GFX8-NOHSA-NEXT: v_or_b32_e32 v0, s5, v0
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
-; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NOHSA-NEXT: s_nop 0
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NOHSA-NEXT: s_endpgm
;
@@ -10920,40 +10921,40 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
;
; GFX12-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX12-NEXT: s_load_b128 s[0:3], s[6:7], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_lshr_b32 s8, s6, 16
-; GFX12-NEXT: v_ashrrev_i16 v5, 8, s6
+; GFX12-NEXT: s_lshr_b32 s6, s2, 16
+; GFX12-NEXT: v_ashrrev_i16 v5, 8, s2
+; GFX12-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX12-NEXT: s_lshr_b32 s8, s0, 16
+; GFX12-NEXT: s_lshr_b32 s9, s1, 16
+; GFX12-NEXT: v_ashrrev_i16 v0, 8, s1
+; GFX12-NEXT: s_bfe_i32 s10, s1, 0x80000
+; GFX12-NEXT: v_ashrrev_i16 v1, 8, s0
+; GFX12-NEXT: s_bfe_i32 s11, s0, 0x80000
+; GFX12-NEXT: s_bfe_i32 s12, s3, 0x80000
+; GFX12-NEXT: s_ashr_i64 s[0:1], s[2:3], 56
+; GFX12-NEXT: v_ashrrev_i16 v13, 8, s6
; GFX12-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX12-NEXT: s_lshr_b32 s10, s4, 16
-; GFX12-NEXT: s_lshr_b32 s11, s5, 16
-; GFX12-NEXT: v_ashrrev_i16 v1, 8, s4
-; GFX12-NEXT: s_bfe_i32 s4, s4, 0x80000
-; GFX12-NEXT: v_ashrrev_i16 v0, 8, s5
-; GFX12-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX12-NEXT: s_bfe_i32 s12, s7, 0x80000
-; GFX12-NEXT: s_ashr_i64 s[2:3], s[6:7], 56
-; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s6
-; GFX12-NEXT: s_bfe_i32 s6, s8, 0x80000
-; GFX12-NEXT: s_lshr_b32 s9, s7, 16
-; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s4
-; GFX12-NEXT: s_bfe_i32 s3, s11, 0x80000
-; GFX12-NEXT: s_bfe_i32 s4, s10, 0x80000
-; GFX12-NEXT: v_ashrrev_i16 v2, 8, s7
-; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s5
+; GFX12-NEXT: s_lshr_b32 s7, s3, 16
+; GFX12-NEXT: v_ashrrev_i16 v2, 8, s3
+; GFX12-NEXT: s_bfe_i32 s1, s9, 0x80000
+; GFX12-NEXT: s_bfe_i32 s3, s8, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v4, 0xffff, s10
+; GFX12-NEXT: v_and_b32_e64 v7, 0xffff, s11
; GFX12-NEXT: v_and_b32_e64 v11, 0xffff, s12
-; GFX12-NEXT: v_ashrrev_i16 v13, 8, s8
+; GFX12-NEXT: v_and_b32_e64 v12, 0xffff, s2
; GFX12-NEXT: v_and_b32_e64 v16, 0xffff, s6
-; GFX12-NEXT: v_ashrrev_i16 v9, 8, s11
-; GFX12-NEXT: v_ashrrev_i16 v10, 8, s10
-; GFX12-NEXT: s_bfe_i32 s5, s9, 0x80000
-; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s3
-; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s4
-; GFX12-NEXT: s_pack_ll_b32_b16 s2, s5, s2
+; GFX12-NEXT: v_ashrrev_i16 v9, 8, s9
+; GFX12-NEXT: v_ashrrev_i16 v10, 8, s8
+; GFX12-NEXT: s_bfe_i32 s2, s7, 0x80000
+; GFX12-NEXT: v_and_b32_e64 v14, 0xffff, s1
+; GFX12-NEXT: v_and_b32_e64 v15, 0xffff, s3
+; GFX12-NEXT: s_pack_ll_b32_b16 s0, s2, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s2
+; GFX12-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v3, s0
; GFX12-NEXT: v_lshl_or_b32 v6, v0, 16, v4
; GFX12-NEXT: v_lshl_or_b32 v4, v1, 16, v7
; GFX12-NEXT: v_lshl_or_b32 v2, v2, 16, v11
@@ -10962,8 +10963,8 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX12-NEXT: v_lshl_or_b32 v7, v9, 16, v14
; GFX12-NEXT: v_lshl_or_b32 v5, v10, 16, v15
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
-; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5] offset:16
+; GFX12-NEXT: global_store_b128 v8, v[4:7], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 21e27bfa75531..8a4090120c239 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -40,19 +40,19 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
;
; GCN-NOHSA-VI-LABEL: global_load_i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_i16:
@@ -145,19 +145,19 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v2i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i16:
@@ -236,20 +236,20 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v3i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i16:
@@ -362,19 +362,19 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v4i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i16:
@@ -447,19 +447,19 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
;
; GCN-NOHSA-VI-LABEL: global_load_v8i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i16:
@@ -546,22 +546,22 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16:
@@ -696,30 +696,30 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0 offset:14
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:10
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:2
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:30
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:26
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:22
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:18
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:12
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:8
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:4
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[4:7], 0
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:28
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:24
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:20
-; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:14
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:10
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:2
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:30
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:26
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:22
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:18
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:12
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:8
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:4
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:28
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:24
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:20
+; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -751,8 +751,8 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v14, v18
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v15, v19
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16_align2:
@@ -834,19 +834,19 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i32:
@@ -919,19 +919,19 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i32:
@@ -1007,19 +1007,19 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i32:
@@ -1092,19 +1092,19 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i32:
@@ -1184,21 +1184,21 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i32:
@@ -1283,21 +1283,21 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i32:
@@ -1385,22 +1385,22 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v3i16_to_v3i32:
@@ -1495,22 +1495,22 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v3, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v3i16_to_v3i32:
@@ -1613,23 +1613,23 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i32:
@@ -1729,23 +1729,23 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i32:
@@ -1859,17 +1859,17 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xffff, v3
@@ -1879,8 +1879,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i16_to_v8i32:
@@ -2008,17 +2008,17 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
@@ -2028,8 +2028,8 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v8i16_to_v8i32:
@@ -5158,21 +5158,21 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i64:
@@ -5255,21 +5255,21 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i64:
@@ -5350,21 +5350,21 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i64:
@@ -5442,21 +5442,21 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i64:
@@ -5543,23 +5543,23 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i64:
@@ -5653,24 +5653,24 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i64:
@@ -5779,19 +5779,19 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
@@ -5799,8 +5799,8 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v9
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xffff, v8
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i64:
@@ -5925,17 +5925,17 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
-; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
+; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
-; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
+; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
@@ -5948,8 +5948,8 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i64:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 0f9cc33d731f1..121c43623f669 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -39,19 +39,19 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
;
; GCNX3-NOHSA-LABEL: global_load_i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_i32:
@@ -118,19 +118,19 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v2i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i32:
@@ -198,19 +198,19 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v3i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i32:
@@ -282,19 +282,19 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v4i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i32:
@@ -375,22 +375,22 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v8i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i32:
@@ -492,25 +492,25 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
;
; GCNX3-NOHSA-LABEL: global_load_v9i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v9i32:
@@ -623,25 +623,25 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v10i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v10i32:
@@ -753,25 +753,25 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v11i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v11i32:
@@ -888,25 +888,25 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v12i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v12i32:
@@ -1032,28 +1032,28 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v16i32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i32:
@@ -1147,20 +1147,20 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
;
; GCNX3-NOHSA-LABEL: global_zextload_i32_to_i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i32_to_i64:
@@ -1230,20 +1230,20 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
;
; GCNX3-NOHSA-LABEL: global_sextload_i32_to_i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i32_to_i64:
@@ -1314,20 +1314,20 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v1i32_to_v1i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i32_to_v1i64:
@@ -1397,20 +1397,20 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v1i32_to_v1i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i32_to_v1i64:
@@ -1487,23 +1487,23 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v2i32_to_v2i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v1, 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v0, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v3
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i32_to_v2i64:
@@ -1583,22 +1583,22 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v2i32_to_v2i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v2, v1
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i32_to_v2i64:
@@ -1694,27 +1694,27 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v7, v5
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v3
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v4, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v6, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i32_to_v4i64:
@@ -1821,17 +1821,17 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v10, 31, v3
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v8, 31, v2
@@ -1841,8 +1841,8 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-NOHSA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v3, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v5, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i32_to_v4i64:
@@ -1981,36 +1981,36 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
;
; GCNX3-NOHSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v9, 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v11, v9
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v2
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v3
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v1
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v6
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v7
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_nop 0
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v8, v4
; GCNX3-NOHSA-NEXT: v_mov_b32_e32 v10, v5
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i32_to_v8i64:
@@ -4515,14 +4515,14 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
;
; GCNX3-NOHSA-LABEL: global_load_v32i32:
; GCNX3-NOHSA: ; %bb.0:
-; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
-; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
-; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
-; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
+; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCNX3-NOHSA-NEXT: s_mov_b32 s3, 0xf000
+; GCNX3-NOHSA-NEXT: s_mov_b32 s2, -1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s2
+; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s3
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
-; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
-; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
+; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s6
+; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s7
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:112
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
@@ -4531,22 +4531,22 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
-; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
-; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
+; GCNX3-NOHSA-NEXT: s_mov_b32 s0, s4
+; GCNX3-NOHSA-NEXT: s_mov_b32 s1, s5
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:96
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:112
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(6)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:64
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:80
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:64
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:80
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0 offset:48
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(7)
-; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[4:7], 0 offset:16
+; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
;
; EG-LABEL: global_load_v32i32:
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index f19eeee1ca741..76d52686a9feb 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -4,25 +4,25 @@
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_flat:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB0_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[6:7], 0xb0
; GCN-NEXT: .LBB0_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
; GCN-NEXT: flat_store_b128 v[4:5], v[0:3]
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
@@ -50,25 +50,25 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_global:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB1_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0
+; GCN-NEXT: s_add_nc_u64 s[0:1], s[6:7], 0xb0
; GCN-NEXT: .LBB1_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176
-; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
+; GCN-NEXT: global_load_b128 v[1:4], v0, s[0:1] offset:-176
+; GCN-NEXT: s_prefetch_data s[0:1], 0x0, null, 0
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
; GCN-NEXT: .LBB1_3: ; %for.end
; GCN-NEXT: s_nop 0
@@ -96,26 +96,26 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_constant:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x34
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s4, 0
+; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB2_3
; GCN-NEXT: ; %bb.1: ; %for.body.preheader
-; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: .LBB2_2: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
-; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0
-; GCN-NEXT: s_add_co_i32 s4, s4, -1
-; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16
-; GCN-NEXT: s_cmp_lg_u32 s4, 0
+; GCN-NEXT: s_load_b128 s[8:11], s[6:7], 0x0
+; GCN-NEXT: s_prefetch_data s[6:7], 0xb0, null, 0
+; GCN-NEXT: s_add_co_i32 s2, s2, -1
+; GCN-NEXT: s_add_nc_u64 s[6:7], s[6:7], 16
+; GCN-NEXT: s_cmp_lg_u32 s2, 0
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9
; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11
-; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1]
-; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
+; GCN-NEXT: global_store_b128 v0, v[1:4], s[4:5]
+; GCN-NEXT: s_add_nc_u64 s[4:5], s[4:5], 16
; GCN-NEXT: s_cbranch_scc1 .LBB2_2
; GCN-NEXT: .LBB2_3: ; %for.end
; GCN-NEXT: s_nop 0
@@ -143,20 +143,20 @@ for.end: ; preds = %for.body, %entry
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) {
; GCN-LABEL: copy_local:
; GCN: ; %bb.0: ; %entry
-; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24
+; GCN-NEXT: s_load_b96 s[4:6], s[0:1], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: s_cmp_eq_u32 s2, 0
+; GCN-NEXT: s_cmp_eq_u32 s6, 0
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
; GCN-NEXT: .LBB3_1: ; %for.body
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v4, s0
-; GCN-NEXT: s_add_co_i32 s2, s2, -1
-; GCN-NEXT: s_add_co_i32 s0, s0, 16
-; GCN-NEXT: s_add_co_i32 s1, s1, 16
+; GCN-NEXT: v_mov_b32_e32 v2, s5
+; GCN-NEXT: v_mov_b32_e32 v4, s4
+; GCN-NEXT: s_add_co_i32 s6, s6, -1
+; GCN-NEXT: s_add_co_i32 s4, s4, 16
+; GCN-NEXT: s_add_co_i32 s5, s5, 16
; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3
; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1
-; GCN-NEXT: s_cmp_lg_u32 s2, 0
+; GCN-NEXT: s_cmp_lg_u32 s6, 0
; GCN-NEXT: s_wait_dscnt 0x1
; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3
; GCN-NEXT: s_wait_dscnt 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
index cb3ea2e812770..ad4af2f4c948e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-memcpy.ll
@@ -24,13 +24,13 @@ define protected amdgpu_kernel void @test(ptr addrspace(1) nocapture %ptr.coerce
; GCN-NEXT: ds_write_b8 v0, v1
; GCN-NEXT: ds_read_u8 v2, v0 offset:2
; GCN-NEXT: ds_read_u16 v3, v0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b8 v0, v2 offset:6
; GCN-NEXT: ds_write_b16 v0, v3 offset:4
-; GCN-NEXT: v_cmp_eq_u16_sdwa s[2:3], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
-; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
-; GCN-NEXT: global_store_byte v0, v1, s[0:1]
+; GCN-NEXT: v_cmp_eq_u16_sdwa s[0:1], v3, v1 src0_sel:BYTE_0 src1_sel:DWORD
+; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GCN-NEXT: global_store_byte v0, v1, s[2:3]
; GCN-NEXT: s_endpgm
; CHECK-LABEL: define protected amdgpu_kernel void @test(
; CHECK-SAME: ptr addrspace(1) nocapture [[PTR_COERCE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
index c6a734a065ff1..32318abd49209 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
@@ -37,10 +38,10 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2(ptr addrspace(1) %arg, i
; GCN-NEXT: ds_write_b32 v1, v2 offset:256
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:256
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
store i32 1, ptr addrspace(3) @a, align 4
@@ -88,11 +89,11 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x3(ptr addrspace(1) %arg, i
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: ds_read_b32 v3, v0 offset:256
; GCN-NEXT: ds_read_b32 v0, v0 offset:512
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_u32_e32 v2, v2, v3
; GCN-NEXT: v_add_u32_e32 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
store i32 1, ptr addrspace(3) @a, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index e9a1b38eee157..b11cd198eb41c 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -74,7 +74,7 @@ define i64 @add_u64_vv(i64 %v, i64 %a) {
define amdgpu_kernel void @add_u64_sv(i64 %v) {
; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GCN: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
%a = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -83,7 +83,7 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
define amdgpu_kernel void @add_u64_vs(i64 %a) {
; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3]
%v = load i64, ptr undef
%add = add i64 %v, %a
store i64 %add, ptr undef
@@ -93,7 +93,7 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
; GCN-LABEL: add_u64_ss:
; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GCN: s_addc_u32 s1, s5, s7
%add = add i64 %v, %a
store i64 %add, ptr undef
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
index 67071327e3aff..7361e575f4884 100644
--- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -79,26 +79,26 @@ define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <
define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
@@ -131,24 +131,24 @@ define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-LABEL: v_lshr_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_lshr_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -353,27 +353,27 @@ define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1
define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_imm_v_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_imm_v_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8
@@ -404,24 +404,24 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_imm_v_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_imm_v_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -438,26 +438,26 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 24, v3
@@ -485,24 +485,24 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_v_imm_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -519,27 +519,27 @@ define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
@@ -582,26 +582,26 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX10-LABEL: v_lshr_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_lshr_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, v3, v1
; GFX11-NEXT: v_pk_lshrrev_b16 v0, v2, v0
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -620,27 +620,27 @@ define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: lshr_v_imm_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: lshr_v_imm_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v4, 24, v1
@@ -673,26 +673,26 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace
;
; GFX10-LABEL: lshr_v_imm_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: lshr_v_imm_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
index 995c8c8679397..5fd0144abf9d7 100644
--- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll
@@ -9,17 +9,17 @@
define amdgpu_kernel void @mad_u16(
; GFX8-LABEL: mad_u16:
; GFX8: ; %bb.0: ; %entry
-; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s8, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT: v_mov_b32_e32 v5, s7
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s6, v4
+; GFX8-NEXT: v_mov_b32_e32 v5, s11
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s10, v4
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; GFX8-NEXT: flat_load_ushort v6, v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -27,57 +27,57 @@ define amdgpu_kernel void @mad_u16(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v3, v[4:5] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mad_u16 v2, v6, v2, v3
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: mad_u16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[4:5] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] glc
+; GFX9-NEXT: global_load_ushort v3, v0, s[10:11] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mad_u16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v2, v0, s[4:5] glc dlc
+; GFX10-NEXT: global_load_ushort v2, v0, s[8:9] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v3, v0, s[6:7] glc dlc
+; GFX10-NEXT: global_load_ushort v3, v0, s[10:11] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: v_mad_u16 v1, v1, v2, v3
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mad_u16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_lshlrev_b32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[4:5] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[8:9] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[6:7] glc dlc
+; GFX11-NEXT: global_load_u16 v0, v0, s[10:11] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u16 v0, v1, v2, v0
-; GFX11-NEXT: global_store_b16 v3, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v3, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 400298bcff4f9..c1c526c33bb78 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -959,12 +959,12 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s6, s7
-; GFX11-NEXT: s_mul_hi_u32 s3, s6, s7
-; GFX11-NEXT: s_add_u32 s0, s2, s0
-; GFX11-NEXT: s_addc_u32 s1, s3, s1
+; GFX11-NEXT: s_mul_i32 s0, s6, s7
+; GFX11-NEXT: s_mul_hi_u32 s1, s6, s7
+; GFX11-NEXT: s_add_u32 s0, s0, s2
+; GFX11-NEXT: s_addc_u32 s1, s1, s3
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
@@ -976,15 +976,15 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s3, 0
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s1, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, s6
+; GFX12-NEXT: s_mov_b32 s0, s6
; GFX12-NEXT: s_mov_b32 s6, s7
-; GFX12-NEXT: s_mov_b32 s7, s3
+; GFX12-NEXT: s_mov_b32 s7, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[6:7]
-; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll
index def0dfa4b903b..3bb573246d865 100644
--- a/llvm/test/CodeGen/AMDGPU/madak.ll
+++ b/llvm/test/CodeGen/AMDGPU/madak.ll
@@ -35,14 +35,14 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX8-LABEL: madak_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
@@ -86,12 +86,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -133,12 +133,12 @@ define amdgpu_kernel void @madak_f32(ptr addrspace(1) noalias %out, ptr addrspac
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fmaak_f32 v1, v1, v2, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -504,14 +504,14 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX8-LABEL: madak_inline_imm_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
@@ -555,12 +555,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e32 v1, v1, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -602,12 +602,12 @@ define amdgpu_kernel void @madak_inline_imm_f32(ptr addrspace(1) noalias %out, p
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, v2, 4.0
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -835,13 +835,13 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-MAD-NEXT: s_clause 0x1
-; GFX11-MAD-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-MAD-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s2, v1
+; GFX11-MAD-NEXT: v_mul_f32_e32 v1, s4, v1
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-MAD-NEXT: v_add_f32_e32 v1, 0x41200000, v1
-; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-MAD-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-MAD-NEXT: s_nop 0
; GFX11-MAD-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-MAD-NEXT: s_endpgm
@@ -882,11 +882,11 @@ define amdgpu_kernel void @v_s_madak_f32(ptr addrspace(1) noalias %out, float %a
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-FMA-NEXT: s_clause 0x1
-; GFX11-FMA-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-FMA-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-FMA-NEXT: v_fmaak_f32 v1, s2, v1, 0x41200000
-; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-FMA-NEXT: v_fmaak_f32 v1, s4, v1, 0x41200000
+; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-FMA-NEXT: s_nop 0
; GFX11-FMA-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-FMA-NEXT: s_endpgm
@@ -1024,20 +1024,20 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX8-LABEL: no_madak_src0_modifier_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_f32 v2, |v5|, v2, s0
@@ -1077,12 +1077,12 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e64 v1, |v1|, v2
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1125,12 +1125,12 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(ptr addrspace(1) noalias %
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, |v1|, v2, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
@@ -1177,20 +1177,20 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX8-LABEL: no_madak_src1_modifier_f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: flat_load_dword v2, v[2:3]
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: s_mov_b32 s0, 0x41200000
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_f32 v2, v5, |v2|, s0
@@ -1230,12 +1230,12 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX11-MAD: ; %bb.0:
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-MAD-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-MAD-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-MAD-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-MAD-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-MAD-NEXT: s_clause 0x1
; GFX11-MAD-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-MAD-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-MAD-NEXT: s_waitcnt vmcnt(0)
; GFX11-MAD-NEXT: v_mul_f32_e64 v1, v1, |v2|
; GFX11-MAD-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -1278,12 +1278,12 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(ptr addrspace(1) noalias %
; GFX11-FMA: ; %bb.0:
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-FMA-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-FMA-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-FMA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-FMA-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-FMA-NEXT: s_clause 0x1
; GFX11-FMA-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-FMA-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-FMA-NEXT: s_waitcnt vmcnt(0)
; GFX11-FMA-NEXT: v_fma_f32 v1, v1, |v2|, 0x41200000
; GFX11-FMA-NEXT: global_store_b32 v0, v1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
index 2b5d32fa7b977..e8c6baaddd16e 100644
--- a/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
+++ b/llvm/test/CodeGen/AMDGPU/max-hard-clause-length.ll
@@ -397,82 +397,82 @@ define amdgpu_kernel void @long_store_chain(ptr addrspace(1) %p) {
define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX10-LABEL: long_load_chain:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x3e
-; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0
-; GFX10-NEXT: s_load_dword s3, s[0:1], 0x10
-; GFX10-NEXT: s_load_dword s4, s[0:1], 0x20
-; GFX10-NEXT: s_load_dword s5, s[0:1], 0x30
-; GFX10-NEXT: s_load_dword s6, s[0:1], 0x40
-; GFX10-NEXT: s_load_dword s7, s[0:1], 0x50
-; GFX10-NEXT: s_load_dword s8, s[0:1], 0x60
-; GFX10-NEXT: s_load_dword s9, s[0:1], 0x70
-; GFX10-NEXT: s_load_dword s10, s[0:1], 0x80
-; GFX10-NEXT: s_load_dword s11, s[0:1], 0x90
-; GFX10-NEXT: s_load_dword s12, s[0:1], 0xa0
-; GFX10-NEXT: s_load_dword s13, s[0:1], 0xb0
-; GFX10-NEXT: s_load_dword s14, s[0:1], 0xc0
-; GFX10-NEXT: s_load_dword s15, s[0:1], 0xd0
-; GFX10-NEXT: s_load_dword s16, s[0:1], 0xe0
-; GFX10-NEXT: s_load_dword s17, s[0:1], 0xf0
-; GFX10-NEXT: s_load_dword s18, s[0:1], 0x100
-; GFX10-NEXT: s_load_dword s19, s[0:1], 0x110
-; GFX10-NEXT: s_load_dword s20, s[0:1], 0x120
-; GFX10-NEXT: s_load_dword s21, s[0:1], 0x130
-; GFX10-NEXT: s_load_dword s22, s[0:1], 0x140
-; GFX10-NEXT: s_load_dword s23, s[0:1], 0x150
-; GFX10-NEXT: s_load_dword s24, s[0:1], 0x160
-; GFX10-NEXT: s_load_dword s25, s[0:1], 0x170
-; GFX10-NEXT: s_load_dword s26, s[0:1], 0x180
-; GFX10-NEXT: s_load_dword s27, s[0:1], 0x190
-; GFX10-NEXT: s_load_dword s28, s[0:1], 0x1a0
-; GFX10-NEXT: s_load_dword s29, s[0:1], 0x1b0
-; GFX10-NEXT: s_load_dword s30, s[0:1], 0x1c0
-; GFX10-NEXT: s_load_dword s31, s[0:1], 0x1d0
-; GFX10-NEXT: s_load_dword s33, s[0:1], 0x1e0
-; GFX10-NEXT: s_load_dword s34, s[0:1], 0x1f0
-; GFX10-NEXT: s_load_dword s35, s[0:1], 0x200
-; GFX10-NEXT: s_load_dword s36, s[0:1], 0x210
-; GFX10-NEXT: s_load_dword s37, s[0:1], 0x220
-; GFX10-NEXT: s_load_dword s38, s[0:1], 0x230
-; GFX10-NEXT: s_load_dword s39, s[0:1], 0x240
-; GFX10-NEXT: s_load_dword s40, s[0:1], 0x250
-; GFX10-NEXT: s_load_dword s41, s[0:1], 0x260
-; GFX10-NEXT: s_load_dword s42, s[0:1], 0x270
-; GFX10-NEXT: s_load_dword s43, s[0:1], 0x280
-; GFX10-NEXT: s_load_dword s44, s[0:1], 0x290
-; GFX10-NEXT: s_load_dword s45, s[0:1], 0x2a0
-; GFX10-NEXT: s_load_dword s46, s[0:1], 0x2b0
-; GFX10-NEXT: s_load_dword s47, s[0:1], 0x2c0
-; GFX10-NEXT: s_load_dword s48, s[0:1], 0x2d0
-; GFX10-NEXT: s_load_dword s49, s[0:1], 0x2e0
-; GFX10-NEXT: s_load_dword s50, s[0:1], 0x2f0
-; GFX10-NEXT: s_load_dword s51, s[0:1], 0x300
-; GFX10-NEXT: s_load_dword s52, s[0:1], 0x310
-; GFX10-NEXT: s_load_dword s53, s[0:1], 0x320
-; GFX10-NEXT: s_load_dword s54, s[0:1], 0x330
-; GFX10-NEXT: s_load_dword s55, s[0:1], 0x340
-; GFX10-NEXT: s_load_dword s56, s[0:1], 0x350
-; GFX10-NEXT: s_load_dword s57, s[0:1], 0x360
-; GFX10-NEXT: s_load_dword s58, s[0:1], 0x370
-; GFX10-NEXT: s_load_dword s59, s[0:1], 0x380
-; GFX10-NEXT: s_load_dword s60, s[0:1], 0x390
-; GFX10-NEXT: s_load_dword s61, s[0:1], 0x3a0
-; GFX10-NEXT: s_load_dword s62, s[0:1], 0x3b0
-; GFX10-NEXT: s_load_dword s63, s[0:1], 0x3c0
-; GFX10-NEXT: s_load_dword s64, s[0:1], 0x3d0
-; GFX10-NEXT: s_load_dword s65, s[0:1], 0x3e0
+; GFX10-NEXT: s_load_dword s0, s[2:3], 0x0
+; GFX10-NEXT: s_load_dword s1, s[2:3], 0x10
+; GFX10-NEXT: s_load_dword s4, s[2:3], 0x20
+; GFX10-NEXT: s_load_dword s5, s[2:3], 0x30
+; GFX10-NEXT: s_load_dword s6, s[2:3], 0x40
+; GFX10-NEXT: s_load_dword s7, s[2:3], 0x50
+; GFX10-NEXT: s_load_dword s8, s[2:3], 0x60
+; GFX10-NEXT: s_load_dword s9, s[2:3], 0x70
+; GFX10-NEXT: s_load_dword s10, s[2:3], 0x80
+; GFX10-NEXT: s_load_dword s11, s[2:3], 0x90
+; GFX10-NEXT: s_load_dword s12, s[2:3], 0xa0
+; GFX10-NEXT: s_load_dword s13, s[2:3], 0xb0
+; GFX10-NEXT: s_load_dword s14, s[2:3], 0xc0
+; GFX10-NEXT: s_load_dword s15, s[2:3], 0xd0
+; GFX10-NEXT: s_load_dword s16, s[2:3], 0xe0
+; GFX10-NEXT: s_load_dword s17, s[2:3], 0xf0
+; GFX10-NEXT: s_load_dword s18, s[2:3], 0x100
+; GFX10-NEXT: s_load_dword s19, s[2:3], 0x110
+; GFX10-NEXT: s_load_dword s20, s[2:3], 0x120
+; GFX10-NEXT: s_load_dword s21, s[2:3], 0x130
+; GFX10-NEXT: s_load_dword s22, s[2:3], 0x140
+; GFX10-NEXT: s_load_dword s23, s[2:3], 0x150
+; GFX10-NEXT: s_load_dword s24, s[2:3], 0x160
+; GFX10-NEXT: s_load_dword s25, s[2:3], 0x170
+; GFX10-NEXT: s_load_dword s26, s[2:3], 0x180
+; GFX10-NEXT: s_load_dword s27, s[2:3], 0x190
+; GFX10-NEXT: s_load_dword s28, s[2:3], 0x1a0
+; GFX10-NEXT: s_load_dword s29, s[2:3], 0x1b0
+; GFX10-NEXT: s_load_dword s30, s[2:3], 0x1c0
+; GFX10-NEXT: s_load_dword s31, s[2:3], 0x1d0
+; GFX10-NEXT: s_load_dword s33, s[2:3], 0x1e0
+; GFX10-NEXT: s_load_dword s34, s[2:3], 0x1f0
+; GFX10-NEXT: s_load_dword s35, s[2:3], 0x200
+; GFX10-NEXT: s_load_dword s36, s[2:3], 0x210
+; GFX10-NEXT: s_load_dword s37, s[2:3], 0x220
+; GFX10-NEXT: s_load_dword s38, s[2:3], 0x230
+; GFX10-NEXT: s_load_dword s39, s[2:3], 0x240
+; GFX10-NEXT: s_load_dword s40, s[2:3], 0x250
+; GFX10-NEXT: s_load_dword s41, s[2:3], 0x260
+; GFX10-NEXT: s_load_dword s42, s[2:3], 0x270
+; GFX10-NEXT: s_load_dword s43, s[2:3], 0x280
+; GFX10-NEXT: s_load_dword s44, s[2:3], 0x290
+; GFX10-NEXT: s_load_dword s45, s[2:3], 0x2a0
+; GFX10-NEXT: s_load_dword s46, s[2:3], 0x2b0
+; GFX10-NEXT: s_load_dword s47, s[2:3], 0x2c0
+; GFX10-NEXT: s_load_dword s48, s[2:3], 0x2d0
+; GFX10-NEXT: s_load_dword s49, s[2:3], 0x2e0
+; GFX10-NEXT: s_load_dword s50, s[2:3], 0x2f0
+; GFX10-NEXT: s_load_dword s51, s[2:3], 0x300
+; GFX10-NEXT: s_load_dword s52, s[2:3], 0x310
+; GFX10-NEXT: s_load_dword s53, s[2:3], 0x320
+; GFX10-NEXT: s_load_dword s54, s[2:3], 0x330
+; GFX10-NEXT: s_load_dword s55, s[2:3], 0x340
+; GFX10-NEXT: s_load_dword s56, s[2:3], 0x350
+; GFX10-NEXT: s_load_dword s57, s[2:3], 0x360
+; GFX10-NEXT: s_load_dword s58, s[2:3], 0x370
+; GFX10-NEXT: s_load_dword s59, s[2:3], 0x380
+; GFX10-NEXT: s_load_dword s60, s[2:3], 0x390
+; GFX10-NEXT: s_load_dword s61, s[2:3], 0x3a0
+; GFX10-NEXT: s_load_dword s62, s[2:3], 0x3b0
+; GFX10-NEXT: s_load_dword s63, s[2:3], 0x3c0
+; GFX10-NEXT: s_load_dword s64, s[2:3], 0x3d0
+; GFX10-NEXT: s_load_dword s65, s[2:3], 0x3e0
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: s_load_dword s66, s[0:1], 0x3f0
-; GFX10-NEXT: s_load_dword s67, s[0:1], 0x400
-; GFX10-NEXT: s_load_dword s0, s[0:1], 0x410
+; GFX10-NEXT: s_load_dword s66, s[2:3], 0x3f0
+; GFX10-NEXT: s_load_dword s67, s[2:3], 0x400
+; GFX10-NEXT: s_load_dword s2, s[2:3], 0x410
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s2
+; GFX10-NEXT: ; use s0
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s3
+; GFX10-NEXT: ; use s1
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
; GFX10-NEXT: ; use s4
@@ -664,89 +664,89 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX10-NEXT: ; use s67
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: ;;#ASMSTART
-; GFX10-NEXT: ; use s0
+; GFX10-NEXT: ; use s2
; GFX10-NEXT: ;;#ASMEND
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: long_load_chain:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x10
-; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x20
-; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x30
-; GFX11-NEXT: s_load_b32 s6, s[0:1], 0x40
-; GFX11-NEXT: s_load_b32 s7, s[0:1], 0x50
-; GFX11-NEXT: s_load_b32 s8, s[0:1], 0x60
-; GFX11-NEXT: s_load_b32 s9, s[0:1], 0x70
-; GFX11-NEXT: s_load_b32 s10, s[0:1], 0x80
-; GFX11-NEXT: s_load_b32 s11, s[0:1], 0x90
-; GFX11-NEXT: s_load_b32 s12, s[0:1], 0xa0
-; GFX11-NEXT: s_load_b32 s13, s[0:1], 0xb0
-; GFX11-NEXT: s_load_b32 s14, s[0:1], 0xc0
-; GFX11-NEXT: s_load_b32 s15, s[0:1], 0xd0
-; GFX11-NEXT: s_load_b32 s16, s[0:1], 0xe0
-; GFX11-NEXT: s_load_b32 s17, s[0:1], 0xf0
-; GFX11-NEXT: s_load_b32 s18, s[0:1], 0x100
-; GFX11-NEXT: s_load_b32 s19, s[0:1], 0x110
-; GFX11-NEXT: s_load_b32 s20, s[0:1], 0x120
-; GFX11-NEXT: s_load_b32 s21, s[0:1], 0x130
-; GFX11-NEXT: s_load_b32 s22, s[0:1], 0x140
-; GFX11-NEXT: s_load_b32 s23, s[0:1], 0x150
-; GFX11-NEXT: s_load_b32 s24, s[0:1], 0x160
-; GFX11-NEXT: s_load_b32 s25, s[0:1], 0x170
-; GFX11-NEXT: s_load_b32 s26, s[0:1], 0x180
-; GFX11-NEXT: s_load_b32 s27, s[0:1], 0x190
-; GFX11-NEXT: s_load_b32 s28, s[0:1], 0x1a0
-; GFX11-NEXT: s_load_b32 s29, s[0:1], 0x1b0
-; GFX11-NEXT: s_load_b32 s30, s[0:1], 0x1c0
-; GFX11-NEXT: s_load_b32 s31, s[0:1], 0x1d0
-; GFX11-NEXT: s_load_b32 s33, s[0:1], 0x1e0
-; GFX11-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x10
+; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x20
+; GFX11-NEXT: s_load_b32 s5, s[2:3], 0x30
+; GFX11-NEXT: s_load_b32 s6, s[2:3], 0x40
+; GFX11-NEXT: s_load_b32 s7, s[2:3], 0x50
+; GFX11-NEXT: s_load_b32 s8, s[2:3], 0x60
+; GFX11-NEXT: s_load_b32 s9, s[2:3], 0x70
+; GFX11-NEXT: s_load_b32 s10, s[2:3], 0x80
+; GFX11-NEXT: s_load_b32 s11, s[2:3], 0x90
+; GFX11-NEXT: s_load_b32 s12, s[2:3], 0xa0
+; GFX11-NEXT: s_load_b32 s13, s[2:3], 0xb0
+; GFX11-NEXT: s_load_b32 s14, s[2:3], 0xc0
+; GFX11-NEXT: s_load_b32 s15, s[2:3], 0xd0
+; GFX11-NEXT: s_load_b32 s16, s[2:3], 0xe0
+; GFX11-NEXT: s_load_b32 s17, s[2:3], 0xf0
+; GFX11-NEXT: s_load_b32 s18, s[2:3], 0x100
+; GFX11-NEXT: s_load_b32 s19, s[2:3], 0x110
+; GFX11-NEXT: s_load_b32 s20, s[2:3], 0x120
+; GFX11-NEXT: s_load_b32 s21, s[2:3], 0x130
+; GFX11-NEXT: s_load_b32 s22, s[2:3], 0x140
+; GFX11-NEXT: s_load_b32 s23, s[2:3], 0x150
+; GFX11-NEXT: s_load_b32 s24, s[2:3], 0x160
+; GFX11-NEXT: s_load_b32 s25, s[2:3], 0x170
+; GFX11-NEXT: s_load_b32 s26, s[2:3], 0x180
+; GFX11-NEXT: s_load_b32 s27, s[2:3], 0x190
+; GFX11-NEXT: s_load_b32 s28, s[2:3], 0x1a0
+; GFX11-NEXT: s_load_b32 s29, s[2:3], 0x1b0
+; GFX11-NEXT: s_load_b32 s30, s[2:3], 0x1c0
+; GFX11-NEXT: s_load_b32 s31, s[2:3], 0x1d0
+; GFX11-NEXT: s_load_b32 s33, s[2:3], 0x1e0
+; GFX11-NEXT: s_load_b32 s34, s[2:3], 0x1f0
; GFX11-NEXT: s_clause 0x1f
-; GFX11-NEXT: s_load_b32 s35, s[0:1], 0x200
-; GFX11-NEXT: s_load_b32 s36, s[0:1], 0x210
-; GFX11-NEXT: s_load_b32 s37, s[0:1], 0x220
-; GFX11-NEXT: s_load_b32 s38, s[0:1], 0x230
-; GFX11-NEXT: s_load_b32 s39, s[0:1], 0x240
-; GFX11-NEXT: s_load_b32 s40, s[0:1], 0x250
-; GFX11-NEXT: s_load_b32 s41, s[0:1], 0x260
-; GFX11-NEXT: s_load_b32 s42, s[0:1], 0x270
-; GFX11-NEXT: s_load_b32 s43, s[0:1], 0x280
-; GFX11-NEXT: s_load_b32 s44, s[0:1], 0x290
-; GFX11-NEXT: s_load_b32 s45, s[0:1], 0x2a0
-; GFX11-NEXT: s_load_b32 s46, s[0:1], 0x2b0
-; GFX11-NEXT: s_load_b32 s47, s[0:1], 0x2c0
-; GFX11-NEXT: s_load_b32 s48, s[0:1], 0x2d0
-; GFX11-NEXT: s_load_b32 s49, s[0:1], 0x2e0
-; GFX11-NEXT: s_load_b32 s50, s[0:1], 0x2f0
-; GFX11-NEXT: s_load_b32 s51, s[0:1], 0x300
-; GFX11-NEXT: s_load_b32 s52, s[0:1], 0x310
-; GFX11-NEXT: s_load_b32 s53, s[0:1], 0x320
-; GFX11-NEXT: s_load_b32 s54, s[0:1], 0x330
-; GFX11-NEXT: s_load_b32 s55, s[0:1], 0x340
-; GFX11-NEXT: s_load_b32 s56, s[0:1], 0x350
-; GFX11-NEXT: s_load_b32 s57, s[0:1], 0x360
-; GFX11-NEXT: s_load_b32 s58, s[0:1], 0x370
-; GFX11-NEXT: s_load_b32 s59, s[0:1], 0x380
-; GFX11-NEXT: s_load_b32 s60, s[0:1], 0x390
-; GFX11-NEXT: s_load_b32 s61, s[0:1], 0x3a0
-; GFX11-NEXT: s_load_b32 s62, s[0:1], 0x3b0
-; GFX11-NEXT: s_load_b32 s63, s[0:1], 0x3c0
-; GFX11-NEXT: s_load_b32 s64, s[0:1], 0x3d0
-; GFX11-NEXT: s_load_b32 s65, s[0:1], 0x3e0
-; GFX11-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX11-NEXT: s_load_b32 s35, s[2:3], 0x200
+; GFX11-NEXT: s_load_b32 s36, s[2:3], 0x210
+; GFX11-NEXT: s_load_b32 s37, s[2:3], 0x220
+; GFX11-NEXT: s_load_b32 s38, s[2:3], 0x230
+; GFX11-NEXT: s_load_b32 s39, s[2:3], 0x240
+; GFX11-NEXT: s_load_b32 s40, s[2:3], 0x250
+; GFX11-NEXT: s_load_b32 s41, s[2:3], 0x260
+; GFX11-NEXT: s_load_b32 s42, s[2:3], 0x270
+; GFX11-NEXT: s_load_b32 s43, s[2:3], 0x280
+; GFX11-NEXT: s_load_b32 s44, s[2:3], 0x290
+; GFX11-NEXT: s_load_b32 s45, s[2:3], 0x2a0
+; GFX11-NEXT: s_load_b32 s46, s[2:3], 0x2b0
+; GFX11-NEXT: s_load_b32 s47, s[2:3], 0x2c0
+; GFX11-NEXT: s_load_b32 s48, s[2:3], 0x2d0
+; GFX11-NEXT: s_load_b32 s49, s[2:3], 0x2e0
+; GFX11-NEXT: s_load_b32 s50, s[2:3], 0x2f0
+; GFX11-NEXT: s_load_b32 s51, s[2:3], 0x300
+; GFX11-NEXT: s_load_b32 s52, s[2:3], 0x310
+; GFX11-NEXT: s_load_b32 s53, s[2:3], 0x320
+; GFX11-NEXT: s_load_b32 s54, s[2:3], 0x330
+; GFX11-NEXT: s_load_b32 s55, s[2:3], 0x340
+; GFX11-NEXT: s_load_b32 s56, s[2:3], 0x350
+; GFX11-NEXT: s_load_b32 s57, s[2:3], 0x360
+; GFX11-NEXT: s_load_b32 s58, s[2:3], 0x370
+; GFX11-NEXT: s_load_b32 s59, s[2:3], 0x380
+; GFX11-NEXT: s_load_b32 s60, s[2:3], 0x390
+; GFX11-NEXT: s_load_b32 s61, s[2:3], 0x3a0
+; GFX11-NEXT: s_load_b32 s62, s[2:3], 0x3b0
+; GFX11-NEXT: s_load_b32 s63, s[2:3], 0x3c0
+; GFX11-NEXT: s_load_b32 s64, s[2:3], 0x3d0
+; GFX11-NEXT: s_load_b32 s65, s[2:3], 0x3e0
+; GFX11-NEXT: s_load_b32 s66, s[2:3], 0x3f0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s67, s[0:1], 0x400
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX11-NEXT: s_load_b32 s67, s[2:3], 0x400
+; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x410
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s2
+; GFX11-NEXT: ; use s0
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s3
+; GFX11-NEXT: ; use s1
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
; GFX11-NEXT: ; use s4
@@ -938,89 +938,89 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX11-NEXT: ; use s67
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: ;;#ASMSTART
-; GFX11-NEXT: ; use s0
+; GFX11-NEXT: ; use s2
; GFX11-NEXT: ;;#ASMEND
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: long_load_chain:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
-; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x10
-; GFX12-NEXT: s_load_b32 s4, s[0:1], 0x20
-; GFX12-NEXT: s_load_b32 s5, s[0:1], 0x30
-; GFX12-NEXT: s_load_b32 s6, s[0:1], 0x40
-; GFX12-NEXT: s_load_b32 s7, s[0:1], 0x50
-; GFX12-NEXT: s_load_b32 s8, s[0:1], 0x60
-; GFX12-NEXT: s_load_b32 s9, s[0:1], 0x70
-; GFX12-NEXT: s_load_b32 s10, s[0:1], 0x80
-; GFX12-NEXT: s_load_b32 s11, s[0:1], 0x90
-; GFX12-NEXT: s_load_b32 s12, s[0:1], 0xa0
-; GFX12-NEXT: s_load_b32 s13, s[0:1], 0xb0
-; GFX12-NEXT: s_load_b32 s14, s[0:1], 0xc0
-; GFX12-NEXT: s_load_b32 s15, s[0:1], 0xd0
-; GFX12-NEXT: s_load_b32 s16, s[0:1], 0xe0
-; GFX12-NEXT: s_load_b32 s17, s[0:1], 0xf0
-; GFX12-NEXT: s_load_b32 s18, s[0:1], 0x100
-; GFX12-NEXT: s_load_b32 s19, s[0:1], 0x110
-; GFX12-NEXT: s_load_b32 s20, s[0:1], 0x120
-; GFX12-NEXT: s_load_b32 s21, s[0:1], 0x130
-; GFX12-NEXT: s_load_b32 s22, s[0:1], 0x140
-; GFX12-NEXT: s_load_b32 s23, s[0:1], 0x150
-; GFX12-NEXT: s_load_b32 s24, s[0:1], 0x160
-; GFX12-NEXT: s_load_b32 s25, s[0:1], 0x170
-; GFX12-NEXT: s_load_b32 s26, s[0:1], 0x180
-; GFX12-NEXT: s_load_b32 s27, s[0:1], 0x190
-; GFX12-NEXT: s_load_b32 s28, s[0:1], 0x1a0
-; GFX12-NEXT: s_load_b32 s29, s[0:1], 0x1b0
-; GFX12-NEXT: s_load_b32 s30, s[0:1], 0x1c0
-; GFX12-NEXT: s_load_b32 s31, s[0:1], 0x1d0
-; GFX12-NEXT: s_load_b32 s33, s[0:1], 0x1e0
-; GFX12-NEXT: s_load_b32 s34, s[0:1], 0x1f0
+; GFX12-NEXT: s_load_b32 s0, s[2:3], 0x0
+; GFX12-NEXT: s_load_b32 s1, s[2:3], 0x10
+; GFX12-NEXT: s_load_b32 s4, s[2:3], 0x20
+; GFX12-NEXT: s_load_b32 s5, s[2:3], 0x30
+; GFX12-NEXT: s_load_b32 s6, s[2:3], 0x40
+; GFX12-NEXT: s_load_b32 s7, s[2:3], 0x50
+; GFX12-NEXT: s_load_b32 s8, s[2:3], 0x60
+; GFX12-NEXT: s_load_b32 s9, s[2:3], 0x70
+; GFX12-NEXT: s_load_b32 s10, s[2:3], 0x80
+; GFX12-NEXT: s_load_b32 s11, s[2:3], 0x90
+; GFX12-NEXT: s_load_b32 s12, s[2:3], 0xa0
+; GFX12-NEXT: s_load_b32 s13, s[2:3], 0xb0
+; GFX12-NEXT: s_load_b32 s14, s[2:3], 0xc0
+; GFX12-NEXT: s_load_b32 s15, s[2:3], 0xd0
+; GFX12-NEXT: s_load_b32 s16, s[2:3], 0xe0
+; GFX12-NEXT: s_load_b32 s17, s[2:3], 0xf0
+; GFX12-NEXT: s_load_b32 s18, s[2:3], 0x100
+; GFX12-NEXT: s_load_b32 s19, s[2:3], 0x110
+; GFX12-NEXT: s_load_b32 s20, s[2:3], 0x120
+; GFX12-NEXT: s_load_b32 s21, s[2:3], 0x130
+; GFX12-NEXT: s_load_b32 s22, s[2:3], 0x140
+; GFX12-NEXT: s_load_b32 s23, s[2:3], 0x150
+; GFX12-NEXT: s_load_b32 s24, s[2:3], 0x160
+; GFX12-NEXT: s_load_b32 s25, s[2:3], 0x170
+; GFX12-NEXT: s_load_b32 s26, s[2:3], 0x180
+; GFX12-NEXT: s_load_b32 s27, s[2:3], 0x190
+; GFX12-NEXT: s_load_b32 s28, s[2:3], 0x1a0
+; GFX12-NEXT: s_load_b32 s29, s[2:3], 0x1b0
+; GFX12-NEXT: s_load_b32 s30, s[2:3], 0x1c0
+; GFX12-NEXT: s_load_b32 s31, s[2:3], 0x1d0
+; GFX12-NEXT: s_load_b32 s33, s[2:3], 0x1e0
+; GFX12-NEXT: s_load_b32 s34, s[2:3], 0x1f0
; GFX12-NEXT: s_clause 0x1f
-; GFX12-NEXT: s_load_b32 s35, s[0:1], 0x200
-; GFX12-NEXT: s_load_b32 s36, s[0:1], 0x210
-; GFX12-NEXT: s_load_b32 s37, s[0:1], 0x220
-; GFX12-NEXT: s_load_b32 s38, s[0:1], 0x230
-; GFX12-NEXT: s_load_b32 s39, s[0:1], 0x240
-; GFX12-NEXT: s_load_b32 s40, s[0:1], 0x250
-; GFX12-NEXT: s_load_b32 s41, s[0:1], 0x260
-; GFX12-NEXT: s_load_b32 s42, s[0:1], 0x270
-; GFX12-NEXT: s_load_b32 s43, s[0:1], 0x280
-; GFX12-NEXT: s_load_b32 s44, s[0:1], 0x290
-; GFX12-NEXT: s_load_b32 s45, s[0:1], 0x2a0
-; GFX12-NEXT: s_load_b32 s46, s[0:1], 0x2b0
-; GFX12-NEXT: s_load_b32 s47, s[0:1], 0x2c0
-; GFX12-NEXT: s_load_b32 s48, s[0:1], 0x2d0
-; GFX12-NEXT: s_load_b32 s49, s[0:1], 0x2e0
-; GFX12-NEXT: s_load_b32 s50, s[0:1], 0x2f0
-; GFX12-NEXT: s_load_b32 s51, s[0:1], 0x300
-; GFX12-NEXT: s_load_b32 s52, s[0:1], 0x310
-; GFX12-NEXT: s_load_b32 s53, s[0:1], 0x320
-; GFX12-NEXT: s_load_b32 s54, s[0:1], 0x330
-; GFX12-NEXT: s_load_b32 s55, s[0:1], 0x340
-; GFX12-NEXT: s_load_b32 s56, s[0:1], 0x350
-; GFX12-NEXT: s_load_b32 s57, s[0:1], 0x360
-; GFX12-NEXT: s_load_b32 s58, s[0:1], 0x370
-; GFX12-NEXT: s_load_b32 s59, s[0:1], 0x380
-; GFX12-NEXT: s_load_b32 s60, s[0:1], 0x390
-; GFX12-NEXT: s_load_b32 s61, s[0:1], 0x3a0
-; GFX12-NEXT: s_load_b32 s62, s[0:1], 0x3b0
-; GFX12-NEXT: s_load_b32 s63, s[0:1], 0x3c0
-; GFX12-NEXT: s_load_b32 s64, s[0:1], 0x3d0
-; GFX12-NEXT: s_load_b32 s65, s[0:1], 0x3e0
-; GFX12-NEXT: s_load_b32 s66, s[0:1], 0x3f0
+; GFX12-NEXT: s_load_b32 s35, s[2:3], 0x200
+; GFX12-NEXT: s_load_b32 s36, s[2:3], 0x210
+; GFX12-NEXT: s_load_b32 s37, s[2:3], 0x220
+; GFX12-NEXT: s_load_b32 s38, s[2:3], 0x230
+; GFX12-NEXT: s_load_b32 s39, s[2:3], 0x240
+; GFX12-NEXT: s_load_b32 s40, s[2:3], 0x250
+; GFX12-NEXT: s_load_b32 s41, s[2:3], 0x260
+; GFX12-NEXT: s_load_b32 s42, s[2:3], 0x270
+; GFX12-NEXT: s_load_b32 s43, s[2:3], 0x280
+; GFX12-NEXT: s_load_b32 s44, s[2:3], 0x290
+; GFX12-NEXT: s_load_b32 s45, s[2:3], 0x2a0
+; GFX12-NEXT: s_load_b32 s46, s[2:3], 0x2b0
+; GFX12-NEXT: s_load_b32 s47, s[2:3], 0x2c0
+; GFX12-NEXT: s_load_b32 s48, s[2:3], 0x2d0
+; GFX12-NEXT: s_load_b32 s49, s[2:3], 0x2e0
+; GFX12-NEXT: s_load_b32 s50, s[2:3], 0x2f0
+; GFX12-NEXT: s_load_b32 s51, s[2:3], 0x300
+; GFX12-NEXT: s_load_b32 s52, s[2:3], 0x310
+; GFX12-NEXT: s_load_b32 s53, s[2:3], 0x320
+; GFX12-NEXT: s_load_b32 s54, s[2:3], 0x330
+; GFX12-NEXT: s_load_b32 s55, s[2:3], 0x340
+; GFX12-NEXT: s_load_b32 s56, s[2:3], 0x350
+; GFX12-NEXT: s_load_b32 s57, s[2:3], 0x360
+; GFX12-NEXT: s_load_b32 s58, s[2:3], 0x370
+; GFX12-NEXT: s_load_b32 s59, s[2:3], 0x380
+; GFX12-NEXT: s_load_b32 s60, s[2:3], 0x390
+; GFX12-NEXT: s_load_b32 s61, s[2:3], 0x3a0
+; GFX12-NEXT: s_load_b32 s62, s[2:3], 0x3b0
+; GFX12-NEXT: s_load_b32 s63, s[2:3], 0x3c0
+; GFX12-NEXT: s_load_b32 s64, s[2:3], 0x3d0
+; GFX12-NEXT: s_load_b32 s65, s[2:3], 0x3e0
+; GFX12-NEXT: s_load_b32 s66, s[2:3], 0x3f0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b32 s67, s[0:1], 0x400
-; GFX12-NEXT: s_load_b32 s0, s[0:1], 0x410
+; GFX12-NEXT: s_load_b32 s67, s[2:3], 0x400
+; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x410
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s2
+; GFX12-NEXT: ; use s0
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s3
+; GFX12-NEXT: ; use s1
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
; GFX12-NEXT: ; use s4
@@ -1212,7 +1212,7 @@ define amdgpu_kernel void @long_load_chain(ptr addrspace(1) %p) {
; GFX12-NEXT: ; use s67
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: ;;#ASMSTART
-; GFX12-NEXT: ; use s0
+; GFX12-NEXT: ; use s2
; GFX12-NEXT: ;;#ASMEND
; GFX12-NEXT: s_endpgm
%v0 = load i32, ptr addrspace(1) %p
diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll
index 8ef2ca2765e8a..920b6cc962adb 100644
--- a/llvm/test/CodeGen/AMDGPU/max.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll
@@ -7,14 +7,14 @@ define amdgpu_kernel void @v_test_imax_sge_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_imax_sge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -55,14 +55,14 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
@@ -105,14 +105,14 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v3i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
@@ -176,14 +176,14 @@ define amdgpu_kernel void @v_test_imax_sge_v4i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_imax_sge_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -230,14 +230,14 @@ define amdgpu_kernel void @v_test_imax_sgt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_imax_sgt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -278,14 +278,14 @@ define amdgpu_kernel void @v_test_umax_uge_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_umax_uge_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -326,14 +326,14 @@ define amdgpu_kernel void @v_test_umax_ugt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: v_test_umax_ugt_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v5, v[0:1]
; VI-NEXT: flat_load_ushort v2, v[2:3]
@@ -373,14 +373,14 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(ptr addrspace(1) %out, ptr addr
; VI-LABEL: v_test_umax_ugt_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 5c88328b6dd91..51b6410d606d4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -267,9 +267,10 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap
;
; GCN-SCRATCH-LABEL: vector_clause_indirect:
; GCN-SCRATCH: ; %bb.0: ; %bb
+; GCN-SCRATCH-NEXT: s_clause 0x1
; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
; GCN-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 3, v0
-; GCN-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v8, 0
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
; GCN-SCRATCH-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3]
@@ -278,9 +279,9 @@ define amdgpu_kernel void @vector_clause_indirect(ptr addrspace(1) noalias nocap
; GCN-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[4:5], off
; GCN-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v[4:5], off offset:16
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(1)
-; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
-; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GCN-SCRATCH-NEXT: global_store_dwordx4 v8, v[4:7], s[4:5] offset:16
; GCN-SCRATCH-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -416,22 +417,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GCN-SCRATCH-NEXT: s_clause 0x1
-; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
-; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44
+; GCN-SCRATCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
+; GCN-SCRATCH-NEXT: s_brev_b32 s0, 1
+; GCN-SCRATCH-NEXT: s_mov_b32 s1, s0
; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off
; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
; GCN-SCRATCH-NEXT: ;;#ASMSTART
; GCN-SCRATCH-NEXT: ;;#ASMEND
; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off
; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
-; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT: s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT: s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s2
+; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s3
+; GCN-SCRATCH-NEXT: s_mov_b32 s3, 0
+; GCN-SCRATCH-NEXT: s_mov_b32 s2, s0
+; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v2, v0
; GCN-SCRATCH-NEXT: exp mrt0 v0, off, off, off done vm
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 2334543157b6e..95866845f56b0 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -2478,20 +2478,20 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
;
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[4:5], 0x0
-; VI-NEXT: s_load_dword s5, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_load_dword s2, s[12:13], 0x0
+; VI-NEXT: s_load_dword s3, s[14:15], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lt_u32 s4, s5
+; VI-NEXT: s_cmp_lt_u32 s2, s3
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec
-; VI-NEXT: s_cselect_b32 s0, s4, s5
+; VI-NEXT: s_cselect_b32 s0, s2, s3
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: flat_store_byte v[2:3], v4
@@ -2499,58 +2499,58 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
;
; GFX9-LABEL: v_test_umin_ult_i32_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_load_dword s8, s[4:5], 0x0
-; GFX9-NEXT: s_load_dword s9, s[6:7], 0x0
+; GFX9-NEXT: s_load_dword s2, s[12:13], 0x0
+; GFX9-NEXT: s_load_dword s3, s[14:15], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lt_u32 s8, s9
-; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
-; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], exec
-; GFX9-NEXT: s_cselect_b32 s4, s8, s9
-; GFX9-NEXT: v_mov_b32_e32 v2, s4
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: s_cmp_lt_u32 s2, s3
+; GFX9-NEXT: s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GFX9-NEXT: s_cselect_b32 s0, s2, s3
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
+; GFX9-NEXT: global_store_dword v0, v2, s[8:9]
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i32_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_load_dword s8, s[4:5], 0x0
-; GFX10-NEXT: s_load_dword s9, s[6:7], 0x0
+; GFX10-NEXT: s_load_dword s0, s[12:13], 0x0
+; GFX10-NEXT: s_load_dword s1, s[14:15], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lt_u32 s8, s9
-; GFX10-NEXT: s_cselect_b32 s4, -1, 0
-; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-NEXT: s_and_b32 s4, s4, exec_lo
-; GFX10-NEXT: s_cselect_b32 s4, s8, s9
-; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: global_store_dword v1, v2, s[0:1]
-; GFX10-NEXT: global_store_byte v1, v0, s[2:3]
+; GFX10-NEXT: s_cmp_lt_u32 s0, s1
+; GFX10-NEXT: s_cselect_b32 s2, -1, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_cselect_b32 s0, s0, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: global_store_dword v1, v2, s[8:9]
+; GFX10-NEXT: global_store_byte v1, v0, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i32_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s4, s[4:5], 0x0
-; GFX11-NEXT: s_load_b32 s5, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[8:9], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[10:11], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lt_u32 s4, s5
-; GFX11-NEXT: s_cselect_b32 s6, -1, 0
+; GFX11-NEXT: s_cmp_lt_u32 s0, s1
+; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s6
-; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
-; GFX11-NEXT: s_cselect_b32 s4, s4, s5
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
+; GFX11-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX11-NEXT: s_cselect_b32 s0, s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
-; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v1, v2, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2629,18 +2629,18 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
;
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s12
+; VI-NEXT: v_mov_b32_e32 v1, s13
+; VI-NEXT: v_mov_b32_e32 v2, s14
+; VI-NEXT: v_mov_b32_e32 v3, s15
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_u32_e32 vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
@@ -2651,50 +2651,50 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
;
; GFX9-LABEL: v_test_umin_ult_i16_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX9-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[10:11]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_umin_ult_i16_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
+; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX10-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX10-NEXT: global_load_ushort v1, v0, s[12:13]
+; GFX10-NEXT: global_load_ushort v2, v0, s[14:15]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
-; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: global_store_short v0, v1, s[8:9]
+; GFX10-NEXT: global_store_byte v0, v2, s[10:11]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_umin_ult_i16_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x0
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_u16 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_u16 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_u16 v1, v0, s[8:9]
+; GFX11-NEXT: global_load_u16 v2, v0, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
index 99120ab4a1424..70082e9600510 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-addsubu64.ll
@@ -7,16 +7,16 @@ define amdgpu_kernel void @add_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%res = add i64 %lhs, 123456789123456789
@@ -30,15 +30,15 @@ define amdgpu_kernel void @add_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%rhs = load volatile i64, ptr addrspace(1) %ptr
@@ -53,16 +53,16 @@ define amdgpu_kernel void @sub_reg_imm(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -28744524
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_ADD_U:%[0-9]+]]:vreg_64 = V_ADD_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], killed [[REG_SEQUENCE]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_ADD_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%res = sub i64 %lhs, 123456789123456789
@@ -76,16 +76,16 @@ define amdgpu_kernel void @sub_imm_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 28744523
; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 -1395630315
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO killed [[REG_SEQUENCE]], [[GLOBAL_LOAD_DWORDX2_SADDR]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%rhs = load volatile i64, ptr addrspace(1) %ptr
%res = sub i64 123456789123456789, %rhs
@@ -99,15 +99,15 @@ define amdgpu_kernel void @sub_reg_reg(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s64) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile load (s64) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SUB_U:%[0-9]+]]:vreg_64 = V_SUB_U64_PSEUDO [[GLOBAL_LOAD_DWORDX2_SADDR]], [[GLOBAL_LOAD_DWORDX2_SADDR1]], implicit-def $vcc_lo, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[V_SUB_U]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORDX2_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s64) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%lhs = load volatile i64, ptr addrspace(1) %ptr
%rhs = load volatile i64, ptr addrspace(1) %ptr
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
index 1cd9afef13b5e..db33ee87770e7 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -7,13 +7,13 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.exp2.f32(float %val)
@@ -27,14 +27,14 @@ define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.exp2.f16(half %val)
@@ -48,13 +48,13 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.log.f32(float %val)
@@ -68,14 +68,14 @@ define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.log.f16(half %val)
@@ -89,13 +89,13 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.rcp.f32(float %val)
@@ -109,14 +109,14 @@ define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.rcp.f16(half %val)
@@ -130,13 +130,13 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.rsq.f32(float %val)
@@ -150,14 +150,14 @@ define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.rsq.f16(half %val)
@@ -171,13 +171,13 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s32) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F32_e64 0, [[GLOBAL_LOAD_DWORD_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F32_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s32) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile float, ptr addrspace(1) %ptr
%res = call float @llvm.amdgcn.sqrt.f32(float %val)
@@ -191,14 +191,14 @@ define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
; CHECK-NEXT: liveins: $sgpr0_sgpr1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %2:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
- ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR %2, [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
; CHECK-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
; CHECK-NEXT: [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
; CHECK-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
- ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+ ; CHECK-NEXT: GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], %2, 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
; CHECK-NEXT: S_ENDPGM 0
%val = load volatile half, ptr addrspace(1) %ptr
%res = call half @llvm.amdgcn.sqrt.f16(half %val)
diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 3c60153df441e..c0e0b50cb6a8a 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -31,99 +31,99 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: test_mul_v2i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v1, v1, v3
; VI-NEXT: v_mul_lo_u32 v0, v0, v2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_mul_v2i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: test_mul_v2i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: test_mul_v2i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: test_mul_v2i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -179,117 +179,117 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_mul_v4i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v3, v3, v7
; VI-NEXT: v_mul_lo_u32 v2, v2, v6
; VI-NEXT: v_mul_lo_u32 v1, v1, v5
; VI-NEXT: v_mul_lo_u32 v0, v0, v4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_v4i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX9-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX9-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_v4i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX10-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX10-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX10-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_v4i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b128 v[4:7], off, s[8:11], 0 offset:16
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX11-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX11-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_v4i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[8:11], null
; GFX12-NEXT: buffer_load_b128 v[4:7], off, s[8:11], null offset:16
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v3, v3, v7
; GFX12-NEXT: v_mul_lo_u32 v2, v2, v6
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v5
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v4
-; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -524,23 +524,23 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_b32 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_b32 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -549,23 +549,23 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s10, -1
-; GFX12-NEXT: s_mov_b32 s11, 0x31016000
-; GFX12-NEXT: s_mov_b32 s14, s10
-; GFX12-NEXT: s_mov_b32 s15, s11
-; GFX12-NEXT: s_mov_b32 s2, s10
-; GFX12-NEXT: s_mov_b32 s3, s11
+; GFX12-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s14, s2
+; GFX12-NEXT: s_mov_b32 s15, s3
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s6
; GFX12-NEXT: s_mov_b32 s13, s7
; GFX12-NEXT: buffer_load_b32 v0, off, s[12:15], null
-; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: buffer_load_b32 v1, off, s[8:11], null
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v1, v0
-; GFX12-NEXT: buffer_store_b32 v0, off, s[8:11], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -618,14 +618,13 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; VI-LABEL: mul64_sext_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x50
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], s2, v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_nop 2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], s2, v0, 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul64_sext_c:
@@ -661,15 +660,15 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
-; GFX11-NEXT: s_mul_hi_i32 s2, s2, 0x50
+; GFX11-NEXT: s_mul_i32 s0, s2, 0x50
+; GFX11-NEXT: s_mul_hi_i32 s1, s2, 0x50
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -729,14 +728,13 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; VI-LABEL: mul64_zext_c:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x50
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v0, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: s_nop 2
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v0, 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul64_zext_c:
@@ -772,15 +770,15 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s2, 0x50
-; GFX11-NEXT: s_mul_hi_u32 s2, s2, 0x50
+; GFX11-NEXT: s_mul_i32 s0, s2, 0x50
+; GFX11-NEXT: s_mul_hi_u32 s1, s2, 0x50
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -845,100 +843,101 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_mul64_sext_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_movk_i32 s2, 0x50
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_movk_i32 s0, 0x50
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, s2, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], v0, s0, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_sext_c:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_movk_i32 s2, 0x50
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x50
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_sext_c:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_sext_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_sext_c:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_i32 v1, 0x50, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -992,100 +991,101 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: v_mul64_zext_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_movk_i32 s2, 0x50
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_movk_i32 s0, 0x50
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, s2, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, s0, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_zext_c:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_movk_i32 s2, 0x50
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_movk_i32 s0, 0x50
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_u32 v1, v0, s2
-; GFX9-NEXT: v_mul_lo_u32 v0, v0, s2
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: v_mul_hi_u32 v1, v0, s0
+; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_zext_c:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX10-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_zext_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX11-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_zext_c:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_u32 v1, 0x50, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 0x50, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1138,98 +1138,99 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: v_mul64_sext_inline_imm:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mad_i64_i32 v[0:1], s[2:3], v0, 9, 0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_mad_i64_i32 v[0:1], s[0:1], v0, 9, 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_nop 2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul64_sext_inline_imm:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX9-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul64_sext_inline_imm:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX10-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul64_sext_inline_imm:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_hi_i32 v1, v0, 9
; GFX11-NEXT: v_mul_lo_u32 v0, v0, 9
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul64_sext_inline_imm:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_hi_i32 v1, 9, v0
; GFX12-NEXT: v_mul_lo_u32 v0, 9, v0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1276,15 +1277,15 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
;
; VI-LABEL: s_mul_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x70
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dword s3, s[0:1], 0x70
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_mul_i32 s0, s2, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i32:
@@ -1319,13 +1320,14 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s2, s2, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: s_mul_i32 s0, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1335,13 +1337,14 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_i32 s2, s2, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: v_mov_b32_e32 v0, s2
-; GFX12-NEXT: s_mov_b32 s2, -1
-; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: s_mul_i32 s0, s2, s3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1383,94 +1386,94 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: v_mul_i32:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i32:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i32:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
-; GFX10-NEXT: s_mov_b32 s4, s0
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
+; GFX10-NEXT: s_mov_b32 s0, s4
; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i32:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i32:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
-; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
+; GFX12-NEXT: s_mov_b32 s0, s4
; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
-; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1517,16 +1520,16 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
;
; VI-LABEL: s_mul_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x70
-; VI-NEXT: s_load_dword s5, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x70
+; VI-NEXT: s_load_dword s3, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_lo_u16_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_lo_u16_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i1:
@@ -1562,14 +1565,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX11-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mul_lo_u16 v0, s2, s3
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1579,14 +1582,14 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
; GFX12-NEXT: s_clause 0x2
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x4c
; GFX12-NEXT: s_load_b32 s3, s[0:1], 0x70
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, s2, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
+; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1647,109 +1650,109 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: v_mul_i1:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_mul_i1:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i1:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s10, s6
-; GFX10-NEXT: s_mov_b32 s11, s7
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s10, s2
+; GFX10-NEXT: s_mov_b32 s11, s3
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s8, s2
-; GFX10-NEXT: s_mov_b32 s9, s3
+; GFX10-NEXT: s_mov_b32 s8, s6
+; GFX10-NEXT: s_mov_b32 s9, s7
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; GFX10-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:4
-; GFX10-NEXT: s_mov_b32 s4, s0
-; GFX10-NEXT: s_mov_b32 s5, s1
+; GFX10-NEXT: s_mov_b32 s0, s4
+; GFX10-NEXT: s_mov_b32 s1, s5
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX10-NEXT: buffer_store_byte v0, off, s[4:7], 0
+; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i1:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: buffer_load_u8 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_u8 v1, off, s[8:11], 0 offset:4
-; GFX11-NEXT: s_mov_b32 s4, s0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: buffer_store_b8 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i1:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s10, s6
-; GFX12-NEXT: s_mov_b32 s11, s7
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s8, s2
-; GFX12-NEXT: s_mov_b32 s9, s3
+; GFX12-NEXT: s_mov_b32 s8, s6
+; GFX12-NEXT: s_mov_b32 s9, s7
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: buffer_load_u8 v0, off, s[8:11], null
; GFX12-NEXT: buffer_load_u8 v1, off, s[8:11], null offset:4
-; GFX12-NEXT: s_mov_b32 s4, s0
-; GFX12-NEXT: s_mov_b32 s5, s1
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX12-NEXT: buffer_store_b8 v0, off, s[4:7], null
+; GFX12-NEXT: buffer_store_b8 v0, off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -1881,17 +1884,17 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s1, s6, s1
-; GFX11-NEXT: s_mul_hi_u32 s2, s6, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_add_i32 s1, s2, s1
-; GFX11-NEXT: s_mul_i32 s2, s7, s0
-; GFX11-NEXT: s_mul_i32 s0, s6, s0
-; GFX11-NEXT: s_add_i32 s1, s1, s2
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: s_mul_i32 s0, s6, s3
+; GFX11-NEXT: s_mul_hi_u32 s1, s6, s2
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_mul_i32 s1, s7, s2
+; GFX11-NEXT: s_mul_i32 s2, s6, s2
+; GFX11-NEXT: s_add_i32 s0, s0, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_mov_b32 s1, s5
@@ -1904,9 +1907,9 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[0:1]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[6:7], s[2:3]
; GFX12-NEXT: s_mov_b32 s7, 0x31016000
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: s_mov_b32 s6, -1
@@ -2049,20 +2052,20 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
; GFX11-NEXT: buffer_load_b64 v[2:3], off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
; GFX11-NEXT: v_mul_hi_u32 v4, v2, v0
@@ -2071,7 +2074,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v1, v4, v1
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v3
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2080,20 +2083,20 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s10, -1
-; GFX12-NEXT: s_mov_b32 s11, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, s10
-; GFX12-NEXT: s_mov_b32 s3, s11
-; GFX12-NEXT: s_mov_b32 s14, s10
-; GFX12-NEXT: s_mov_b32 s15, s11
+; GFX12-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s10, s2
+; GFX12-NEXT: s_mov_b32 s11, s3
+; GFX12-NEXT: s_mov_b32 s14, s2
+; GFX12-NEXT: s_mov_b32 s15, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_mov_b32 s12, s6
; GFX12-NEXT: s_mov_b32 s13, s7
-; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[8:11], null
; GFX12-NEXT: buffer_load_b64 v[2:3], off, s[12:15], null
-; GFX12-NEXT: s_mov_b32 s8, s4
-; GFX12-NEXT: s_mov_b32 s9, s5
+; GFX12-NEXT: s_mov_b32 s0, s4
+; GFX12-NEXT: s_mov_b32 s1, s5
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
@@ -2102,7 +2105,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_add_nc_u32_e32 v1, v3, v1
; GFX12-NEXT: v_add_nc_u32_e32 v1, v1, v4
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[8:11], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2181,32 +2184,32 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: s_mul_i32 s6, s2, s3
-; VI-NEXT: s_mov_b64 s[4:5], 0
+; VI-NEXT: s_mul_i32 s8, s2, s3
+; VI-NEXT: s_mov_b64 s[2:3], 0
; VI-NEXT: s_branch .LBB15_3
; VI-NEXT: .LBB15_2:
-; VI-NEXT: s_mov_b64 s[4:5], -1
-; VI-NEXT: ; implicit-def: $sgpr6
+; VI-NEXT: s_mov_b64 s[2:3], -1
+; VI-NEXT: ; implicit-def: $sgpr8
; VI-NEXT: .LBB15_3: ; %Flow
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccnz .LBB15_5
; VI-NEXT: ; %bb.4: ; %if
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: s_branch .LBB15_6
; VI-NEXT: .LBB15_5:
-; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: .LBB15_6: ; %endif
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: mul32_in_branch:
@@ -2216,102 +2219,102 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_cbranch_scc0 .LBB15_2
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_mul_i32 s6, s2, s3
-; GFX9-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-NEXT: s_mul_i32 s8, s2, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_branch .LBB15_3
; GFX9-NEXT: .LBB15_2:
-; GFX9-NEXT: s_mov_b64 s[4:5], -1
-; GFX9-NEXT: ; implicit-def: $sgpr6
+; GFX9-NEXT: s_mov_b64 s[2:3], -1
+; GFX9-NEXT: ; implicit-def: $sgpr8
; GFX9-NEXT: .LBB15_3: ; %Flow
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5]
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccnz .LBB15_5
; GFX9-NEXT: ; %bb.4: ; %if
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
+; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_branch .LBB15_6
; GFX9-NEXT: .LBB15_5:
-; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: .LBB15_6: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mul32_in_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u32 s2, 0
; GFX10-NEXT: s_cbranch_scc0 .LBB15_2
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_mul_i32 s5, s2, s3
+; GFX10-NEXT: s_mul_i32 s2, s2, s3
; GFX10-NEXT: s_branch .LBB15_3
; GFX10-NEXT: .LBB15_2:
-; GFX10-NEXT: s_mov_b32 s4, -1
-; GFX10-NEXT: ; implicit-def: $sgpr5
+; GFX10-NEXT: s_mov_b32 s8, -1
+; GFX10-NEXT: ; implicit-def: $sgpr2
; GFX10-NEXT: .LBB15_3: ; %Flow
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_vccnz .LBB15_5
; GFX10-NEXT: ; %bb.4: ; %if
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s4, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: buffer_load_dword v0, off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s0, s6
+; GFX10-NEXT: s_mov_b32 s1, s7
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0
; GFX10-NEXT: s_branch .LBB15_6
; GFX10-NEXT: .LBB15_5:
-; GFX10-NEXT: v_mov_b32_e32 v0, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: .LBB15_6: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mul32_in_branch:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: s_mov_b32 s8, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-NEXT: s_cbranch_scc0 .LBB15_2
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_mul_i32 s5, s2, s3
+; GFX11-NEXT: s_mul_i32 s2, s2, s3
; GFX11-NEXT: s_branch .LBB15_3
; GFX11-NEXT: .LBB15_2:
-; GFX11-NEXT: s_mov_b32 s4, -1
-; GFX11-NEXT: ; implicit-def: $sgpr5
+; GFX11-NEXT: s_mov_b32 s8, -1
+; GFX11-NEXT: ; implicit-def: $sgpr2
; GFX11-NEXT: .LBB15_3: ; %Flow
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX11-NEXT: s_cbranch_vccnz .LBB15_5
; GFX11-NEXT: ; %bb.4: ; %if
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_branch .LBB15_6
; GFX11-NEXT: .LBB15_5:
-; GFX11-NEXT: v_mov_b32_e32 v0, s5
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
; GFX11-NEXT: .LBB15_6: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2319,36 +2322,36 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX12-LABEL: mul32_in_branch:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
-; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: s_mov_b32 s8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u32 s2, 0
; GFX12-NEXT: s_cbranch_scc0 .LBB15_2
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_mul_i32 s5, s2, s3
+; GFX12-NEXT: s_mul_i32 s2, s2, s3
; GFX12-NEXT: s_branch .LBB15_3
; GFX12-NEXT: .LBB15_2:
-; GFX12-NEXT: s_mov_b32 s4, -1
-; GFX12-NEXT: ; implicit-def: $sgpr5
+; GFX12-NEXT: s_mov_b32 s8, -1
+; GFX12-NEXT: ; implicit-def: $sgpr2
; GFX12-NEXT: .LBB15_3: ; %Flow
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s8
; GFX12-NEXT: s_cbranch_vccnz .LBB15_5
; GFX12-NEXT: ; %bb.4: ; %if
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s4, s2
-; GFX12-NEXT: s_mov_b32 s5, s3
-; GFX12-NEXT: buffer_load_b32 v0, off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s0, s6
+; GFX12-NEXT: s_mov_b32 s1, s7
+; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null
; GFX12-NEXT: s_branch .LBB15_6
; GFX12-NEXT: .LBB15_5:
-; GFX12-NEXT: v_mov_b32_e32 v0, s5
+; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: .LBB15_6: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null
+; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2445,31 +2448,31 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; VI-LABEL: mul64_in_branch:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
; VI-NEXT: s_cbranch_scc0 .LBB16_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[10:11], s4, v0, 0
-; VI-NEXT: s_mul_i32 s4, s4, s7
-; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; VI-NEXT: s_mul_i32 s4, s5, s6
-; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s8, v0, 0
+; VI-NEXT: s_mul_i32 s2, s8, s11
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: s_mul_i32 s2, s9, s10
+; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; VI-NEXT: s_cbranch_vccnz .LBB16_3
; VI-NEXT: .LBB16_2: ; %if
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
-; VI-NEXT: .LBB16_3: ; %endif
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: .LBB16_3: ; %endif
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB16_4:
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -2477,135 +2480,136 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
;
; GFX9-LABEL: mul64_in_branch:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX9-NEXT: s_cbranch_scc0 .LBB16_3
; GFX9-NEXT: ; %bb.1: ; %else
-; GFX9-NEXT: s_mul_i32 s7, s4, s7
-; GFX9-NEXT: s_mul_hi_u32 s10, s4, s6
-; GFX9-NEXT: s_add_i32 s7, s10, s7
-; GFX9-NEXT: s_mul_i32 s5, s5, s6
-; GFX9-NEXT: s_add_i32 s5, s7, s5
-; GFX9-NEXT: s_mul_i32 s4, s4, s6
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; GFX9-NEXT: s_mul_i32 s2, s8, s11
+; GFX9-NEXT: s_mul_hi_u32 s3, s8, s10
+; GFX9-NEXT: s_add_i32 s2, s3, s2
+; GFX9-NEXT: s_mul_i32 s3, s9, s10
+; GFX9-NEXT: s_add_i32 s3, s2, s3
+; GFX9-NEXT: s_mul_i32 s2, s8, s10
+; GFX9-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX9-NEXT: s_cbranch_vccnz .LBB16_4
; GFX9-NEXT: .LBB16_2: ; %if
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
+; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_branch .LBB16_5
; GFX9-NEXT: .LBB16_3:
-; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX9-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX9-NEXT: s_branch .LBB16_2
; GFX9-NEXT: .LBB16_4:
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: .LBB16_5: ; %endif
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: mul64_in_branch:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX10-NEXT: s_cbranch_scc0 .LBB16_3
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: s_mul_i32 s7, s4, s7
-; GFX10-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX10-NEXT: s_mul_i32 s5, s5, s6
-; GFX10-NEXT: s_add_i32 s7, s8, s7
-; GFX10-NEXT: s_mul_i32 s4, s4, s6
-; GFX10-NEXT: s_add_i32 s5, s7, s5
+; GFX10-NEXT: s_mul_i32 s0, s8, s11
+; GFX10-NEXT: s_mul_hi_u32 s1, s8, s10
+; GFX10-NEXT: s_mul_i32 s2, s9, s10
+; GFX10-NEXT: s_add_i32 s0, s1, s0
+; GFX10-NEXT: s_add_i32 s1, s0, s2
+; GFX10-NEXT: s_mul_i32 s0, s8, s10
; GFX10-NEXT: s_cbranch_execnz .LBB16_4
; GFX10-NEXT: .LBB16_2: ; %if
-; GFX10-NEXT: s_mov_b32 s7, 0x31016000
-; GFX10-NEXT: s_mov_b32 s6, -1
-; GFX10-NEXT: s_mov_b32 s4, s2
-; GFX10-NEXT: s_mov_b32 s5, s3
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX10-NEXT: s_mov_b32 s3, 0x31016000
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s0, s6
+; GFX10-NEXT: s_mov_b32 s1, s7
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; GFX10-NEXT: s_branch .LBB16_5
; GFX10-NEXT: .LBB16_3:
-; GFX10-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX10-NEXT: s_branch .LBB16_2
; GFX10-NEXT: .LBB16_4:
-; GFX10-NEXT: v_mov_b32_e32 v0, s4
-; GFX10-NEXT: v_mov_b32_e32 v1, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: .LBB16_5: ; %endif
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: mul64_in_branch:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX11-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX11-NEXT: s_cbranch_scc0 .LBB16_3
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: s_mul_i32 s7, s4, s7
-; GFX11-NEXT: s_mul_hi_u32 s8, s4, s6
-; GFX11-NEXT: s_mul_i32 s5, s5, s6
-; GFX11-NEXT: s_add_i32 s7, s8, s7
-; GFX11-NEXT: s_mul_i32 s4, s4, s6
-; GFX11-NEXT: s_add_i32 s5, s7, s5
+; GFX11-NEXT: s_mul_i32 s0, s8, s11
+; GFX11-NEXT: s_mul_hi_u32 s1, s8, s10
+; GFX11-NEXT: s_mul_i32 s2, s9, s10
+; GFX11-NEXT: s_add_i32 s0, s1, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_add_i32 s1, s0, s2
+; GFX11-NEXT: s_mul_i32 s0, s8, s10
; GFX11-NEXT: s_cbranch_execnz .LBB16_4
; GFX11-NEXT: .LBB16_2: ; %if
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s4, s2
-; GFX11-NEXT: s_mov_b32 s5, s3
-; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[4:7], 0
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s0, s6
+; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_branch .LBB16_5
; GFX11-NEXT: .LBB16_3:
-; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX11-NEXT: s_branch .LBB16_2
; GFX11-NEXT: .LBB16_4:
-; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: .LBB16_5: ; %endif
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: mul64_in_branch:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX12-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX12-NEXT: s_cmp_lg_u64 s[8:9], 0
; GFX12-NEXT: s_cbranch_scc0 .LBB16_3
; GFX12-NEXT: ; %bb.1: ; %else
-; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[6:7]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[8:9], s[10:11]
; GFX12-NEXT: s_cbranch_execnz .LBB16_4
; GFX12-NEXT: .LBB16_2: ; %if
-; GFX12-NEXT: s_mov_b32 s7, 0x31016000
-; GFX12-NEXT: s_mov_b32 s6, -1
-; GFX12-NEXT: s_mov_b32 s4, s2
-; GFX12-NEXT: s_mov_b32 s5, s3
-; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[4:7], null
+; GFX12-NEXT: s_mov_b32 s3, 0x31016000
+; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s0, s6
+; GFX12-NEXT: s_mov_b32 s1, s7
+; GFX12-NEXT: buffer_load_b64 v[0:1], off, s[0:3], null
; GFX12-NEXT: s_branch .LBB16_5
; GFX12-NEXT: .LBB16_3:
-; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: ; implicit-def: $sgpr0_sgpr1
; GFX12-NEXT: s_branch .LBB16_2
; GFX12-NEXT: .LBB16_4:
-; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-NEXT: .LBB16_5: ; %endif
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
+; GFX12-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -2724,41 +2728,41 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
;
; VI-LABEL: s_mul_i128:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x7c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v5, 0
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[12:13], s8, v0, 0
-; VI-NEXT: s_mul_i32 s7, s8, s7
-; VI-NEXT: v_mov_b32_e32 v6, s8
-; VI-NEXT: v_add_u32_e32 v3, vcc, s7, v3
-; VI-NEXT: s_mul_i32 s12, s9, s6
-; VI-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s4, v6, 0
-; VI-NEXT: v_add_u32_e32 v3, vcc, s12, v3
+; VI-NEXT: v_mov_b32_e32 v0, s10
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v0, 0
+; VI-NEXT: s_mul_i32 s0, s12, s11
+; VI-NEXT: v_mov_b32_e32 v6, s12
+; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v3
+; VI-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0
+; VI-NEXT: s_mul_i32 s2, s13, s10
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3
; VI-NEXT: v_mov_b32_e32 v4, v1
-; VI-NEXT: v_mad_u64_u32 v[6:7], s[6:7], s5, v6, v[4:5]
-; VI-NEXT: v_mov_b32_e32 v8, s4
-; VI-NEXT: v_mad_u64_u32 v[1:2], s[6:7], s10, v8, v[2:3]
+; VI-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s9, v6, v[4:5]
+; VI-NEXT: v_mov_b32_e32 v8, s8
+; VI-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v8, v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, v7
; VI-NEXT: v_mov_b32_e32 v7, v5
-; VI-NEXT: v_mov_b32_e32 v8, s9
-; VI-NEXT: v_mad_u64_u32 v[4:5], s[6:7], s4, v8, v[6:7]
-; VI-NEXT: s_mul_i32 s8, s11, s4
-; VI-NEXT: v_add_u32_e32 v6, vcc, s8, v2
+; VI-NEXT: v_mov_b32_e32 v8, s13
+; VI-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s8, v8, v[6:7]
+; VI-NEXT: s_mul_i32 s2, s15, s8
+; VI-NEXT: v_add_u32_e32 v6, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v2, v5
; VI-NEXT: v_add_u32_e32 v2, vcc, v3, v2
-; VI-NEXT: v_addc_u32_e64 v3, s[6:7], 0, 0, vcc
-; VI-NEXT: s_mul_i32 s8, s10, s5
-; VI-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s5, v8, v[2:3]
-; VI-NEXT: v_add_u32_e32 v5, vcc, s8, v6
+; VI-NEXT: v_addc_u32_e64 v3, s[0:1], 0, 0, vcc
+; VI-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s9, v8, v[2:3]
+; VI-NEXT: s_mul_i32 s2, s14, s9
+; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v6
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; VI-NEXT: v_mov_b32_e32 v1, v4
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_mul_i128:
@@ -2813,53 +2817,53 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
;
; GFX10-LABEL: s_mul_i128:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x7c
+; GFX10-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
; GFX10-NEXT: s_mov_b32 s2, 0
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX10-NEXT: s_mov_b32 s13, s2
+; GFX10-NEXT: s_mov_b32 s1, s2
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_mul_i32 s3, s8, s7
-; GFX10-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX10-NEXT: s_mul_i32 s0, s8, s7
+; GFX10-NEXT: s_mul_hi_u32 s3, s8, s6
; GFX10-NEXT: s_mul_i32 s14, s10, s5
; GFX10-NEXT: s_mul_hi_u32 s15, s10, s4
-; GFX10-NEXT: s_mul_i32 s12, s9, s6
+; GFX10-NEXT: s_mul_i32 s7, s9, s6
; GFX10-NEXT: s_mul_i32 s11, s11, s4
-; GFX10-NEXT: s_add_i32 s3, s7, s3
-; GFX10-NEXT: s_add_i32 s7, s15, s14
+; GFX10-NEXT: s_add_i32 s0, s3, s0
+; GFX10-NEXT: s_add_i32 s3, s15, s14
; GFX10-NEXT: s_mul_i32 s6, s8, s6
; GFX10-NEXT: s_mul_i32 s10, s10, s4
-; GFX10-NEXT: s_add_i32 s3, s3, s12
-; GFX10-NEXT: s_add_i32 s7, s7, s11
+; GFX10-NEXT: s_add_i32 s0, s0, s7
+; GFX10-NEXT: s_add_i32 s3, s3, s11
; GFX10-NEXT: s_mul_i32 s19, s5, s8
; GFX10-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX10-NEXT: s_add_u32 s6, s10, s6
; GFX10-NEXT: s_mul_hi_u32 s18, s5, s8
-; GFX10-NEXT: s_addc_u32 s7, s7, s3
+; GFX10-NEXT: s_addc_u32 s7, s3, s0
; GFX10-NEXT: s_mul_i32 s17, s4, s9
-; GFX10-NEXT: s_add_u32 s3, s19, s20
+; GFX10-NEXT: s_add_u32 s0, s19, s20
; GFX10-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX10-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX10-NEXT: s_mul_i32 s5, s5, s9
; GFX10-NEXT: s_addc_u32 s9, s18, 0
-; GFX10-NEXT: s_add_u32 s3, s17, s3
+; GFX10-NEXT: s_add_u32 s3, s17, s0
; GFX10-NEXT: s_addc_u32 s10, s16, 0
-; GFX10-NEXT: s_mul_i32 s12, s4, s8
+; GFX10-NEXT: s_mul_i32 s0, s4, s8
; GFX10-NEXT: s_add_u32 s4, s9, s10
; GFX10-NEXT: s_addc_u32 s8, 0, 0
; GFX10-NEXT: s_add_u32 s4, s5, s4
; GFX10-NEXT: s_addc_u32 s5, s21, s8
; GFX10-NEXT: s_add_u32 s4, s4, s6
; GFX10-NEXT: s_addc_u32 s5, s5, s7
-; GFX10-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v2, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: v_mov_b32_e32 v3, s5
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
-; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX10-NEXT: s_mov_b32 s15, 0x31016000
+; GFX10-NEXT: s_mov_b32 s14, -1
+; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[12:15], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_mul_i128:
@@ -2867,50 +2871,50 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x4c
; GFX11-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x24
; GFX11-NEXT: s_mov_b32 s2, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_mov_b32 s13, s2
+; GFX11-NEXT: s_mov_b32 s1, s2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mul_i32 s3, s8, s7
-; GFX11-NEXT: s_mul_hi_u32 s7, s8, s6
+; GFX11-NEXT: s_mul_i32 s0, s8, s7
+; GFX11-NEXT: s_mul_hi_u32 s3, s8, s6
; GFX11-NEXT: s_mul_i32 s14, s10, s5
; GFX11-NEXT: s_mul_hi_u32 s15, s10, s4
-; GFX11-NEXT: s_mul_i32 s12, s9, s6
+; GFX11-NEXT: s_mul_i32 s7, s9, s6
; GFX11-NEXT: s_mul_i32 s11, s11, s4
-; GFX11-NEXT: s_add_i32 s3, s7, s3
-; GFX11-NEXT: s_add_i32 s7, s15, s14
+; GFX11-NEXT: s_add_i32 s0, s3, s0
+; GFX11-NEXT: s_add_i32 s3, s15, s14
; GFX11-NEXT: s_mul_i32 s6, s8, s6
; GFX11-NEXT: s_mul_i32 s10, s10, s4
-; GFX11-NEXT: s_add_i32 s3, s3, s12
-; GFX11-NEXT: s_add_i32 s7, s7, s11
+; GFX11-NEXT: s_add_i32 s0, s0, s7
+; GFX11-NEXT: s_add_i32 s3, s3, s11
; GFX11-NEXT: s_mul_i32 s19, s5, s8
; GFX11-NEXT: s_mul_hi_u32 s20, s4, s8
; GFX11-NEXT: s_add_u32 s6, s10, s6
; GFX11-NEXT: s_mul_hi_u32 s18, s5, s8
-; GFX11-NEXT: s_addc_u32 s7, s7, s3
+; GFX11-NEXT: s_addc_u32 s7, s3, s0
; GFX11-NEXT: s_mul_i32 s17, s4, s9
-; GFX11-NEXT: s_add_u32 s3, s19, s20
+; GFX11-NEXT: s_add_u32 s0, s19, s20
; GFX11-NEXT: s_mul_hi_u32 s16, s4, s9
; GFX11-NEXT: s_mul_hi_u32 s21, s5, s9
; GFX11-NEXT: s_mul_i32 s5, s5, s9
; GFX11-NEXT: s_addc_u32 s9, s18, 0
-; GFX11-NEXT: s_add_u32 s3, s17, s3
+; GFX11-NEXT: s_add_u32 s3, s17, s0
; GFX11-NEXT: s_addc_u32 s10, s16, 0
-; GFX11-NEXT: s_mul_i32 s12, s4, s8
+; GFX11-NEXT: s_mul_i32 s0, s4, s8
; GFX11-NEXT: s_add_u32 s4, s9, s10
; GFX11-NEXT: s_addc_u32 s8, 0, 0
; GFX11-NEXT: s_add_u32 s4, s5, s4
; GFX11-NEXT: s_addc_u32 s5, s21, s8
; GFX11-NEXT: s_add_u32 s4, s4, s6
; GFX11-NEXT: s_addc_u32 s5, s5, s7
-; GFX11-NEXT: s_or_b64 s[2:3], s[12:13], s[2:3]
+; GFX11-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v3, s5
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
-; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: s_mov_b32 s15, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, -1
+; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[12:15], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2918,44 +2922,44 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
; GFX12-LABEL: s_mul_i128:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x7c
-; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x4c
+; GFX12-NEXT: s_load_b128 s[8:11], s[0:1], 0x7c
+; GFX12-NEXT: s_load_b128 s[12:15], s[0:1], 0x4c
; GFX12-NEXT: s_mov_b32 s3, 0
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GFX12-NEXT: s_mov_b32 s15, s3
-; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
+; GFX12-NEXT: s_mov_b32 s7, s3
+; GFX12-NEXT: s_mov_b32 s1, s3
; GFX12-NEXT: s_mov_b32 s17, s3
; GFX12-NEXT: s_mov_b32 s19, s3
; GFX12-NEXT: s_mov_b32 s24, s3
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s2, s4
-; GFX12-NEXT: s_mov_b32 s14, s8
-; GFX12-NEXT: s_mov_b32 s12, s9
-; GFX12-NEXT: s_mul_u64 s[22:23], s[14:15], s[2:3]
-; GFX12-NEXT: s_mul_u64 s[20:21], s[12:13], s[2:3]
+; GFX12-NEXT: s_mov_b32 s2, s8
+; GFX12-NEXT: s_mov_b32 s6, s12
+; GFX12-NEXT: s_mov_b32 s0, s13
+; GFX12-NEXT: s_mul_u64 s[22:23], s[6:7], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[20:21], s[0:1], s[2:3]
; GFX12-NEXT: s_mov_b32 s2, s23
-; GFX12-NEXT: s_mov_b32 s16, s5
-; GFX12-NEXT: s_mul_u64 s[4:5], s[4:5], s[10:11]
-; GFX12-NEXT: s_add_nc_u64 s[10:11], s[20:21], s[2:3]
-; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[8:9]
-; GFX12-NEXT: s_mul_u64 s[8:9], s[14:15], s[16:17]
-; GFX12-NEXT: s_mov_b32 s2, s11
-; GFX12-NEXT: s_mov_b32 s11, s3
-; GFX12-NEXT: s_add_nc_u64 s[4:5], s[6:7], s[4:5]
-; GFX12-NEXT: s_add_nc_u64 s[6:7], s[8:9], s[10:11]
-; GFX12-NEXT: s_mul_u64 s[12:13], s[12:13], s[16:17]
+; GFX12-NEXT: s_mov_b32 s16, s9
+; GFX12-NEXT: s_mul_u64 s[10:11], s[10:11], s[12:13]
+; GFX12-NEXT: s_add_nc_u64 s[12:13], s[20:21], s[2:3]
+; GFX12-NEXT: s_mul_u64 s[6:7], s[6:7], s[16:17]
+; GFX12-NEXT: s_mov_b32 s2, s13
+; GFX12-NEXT: s_mov_b32 s13, s3
+; GFX12-NEXT: s_mul_u64 s[8:9], s[8:9], s[14:15]
+; GFX12-NEXT: s_add_nc_u64 s[6:7], s[6:7], s[12:13]
+; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[16:17]
; GFX12-NEXT: s_mov_b32 s18, s7
; GFX12-NEXT: s_mov_b32 s23, s3
; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[18:19]
+; GFX12-NEXT: s_add_nc_u64 s[8:9], s[10:11], s[8:9]
; GFX12-NEXT: s_mov_b32 s25, s6
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[12:13], s[2:3]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: s_or_b64 s[6:7], s[22:23], s[24:25]
-; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5]
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[8:9]
; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
-; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
-; GFX12-NEXT: s_mov_b32 s3, 0x31016000
-; GFX12-NEXT: s_mov_b32 s2, -1
-; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: s_mov_b32 s7, 0x31016000
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: buffer_store_b128 v[0:3], off, s[4:7], null
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -3067,15 +3071,15 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
;
; VI-LABEL: v_mul_i128:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; VI-NEXT: v_mov_b32_e32 v11, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v8, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v8, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[8:9]
@@ -3107,12 +3111,12 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
;
; GFX9-LABEL: v_mul_i128:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX9-NEXT: v_mov_b32_e32 v10, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0
; GFX9-NEXT: v_mul_lo_u32 v14, v5, v2
@@ -3133,18 +3137,18 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX9-NEXT: v_add3_u32 v3, v16, v3, v4
; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v1, v3, vcc
-; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX9-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_mul_i128:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 4, v0
; GFX10-NEXT: v_mov_b32_e32 v10, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[0:1]
-; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v13, s[4:5]
+; GFX10-NEXT: global_load_dwordx4 v[4:7], v13, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[8:9], s0, v0, v4, 0
; GFX10-NEXT: v_mul_lo_u32 v15, v5, v2
@@ -3165,17 +3169,17 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX10-NEXT: v_add3_u32 v3, v7, v3, v12
; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[2:3]
+; GFX10-NEXT: global_store_dwordx4 v13, v[8:11], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i128:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX11-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[0:3], v15, s[0:1]
-; GFX11-NEXT: global_load_b128 v[4:7], v15, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v15, s[4:5]
+; GFX11-NEXT: global_load_b128 v[4:7], v15, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v0, v4, 0
; GFX11-NEXT: v_mul_lo_u32 v14, v5, v2
@@ -3201,19 +3205,19 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13
; GFX11-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo
-; GFX11-NEXT: global_store_b128 v15, v[8:11], s[2:3]
+; GFX11-NEXT: global_store_b128 v15, v[8:11], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_mul_i128:
; GFX12: ; %bb.0: ; %entry
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v13, s[0:1]
-; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v13, s[4:5]
+; GFX12-NEXT: global_load_b128 v[4:7], v13, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0
; GFX12-NEXT: v_mul_lo_u32 v15, v5, v2
@@ -3240,7 +3244,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2
; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo
-; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3]
+; GFX12-NEXT: global_store_b128 v13, v[8:11], s[6:7]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
index 6d7bf0027a822..4770b44725409 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll
@@ -291,18 +291,18 @@ define amdgpu_kernel void @test_smul24_i64(ptr addrspace(1) %out, [8 x i32], i32
;
; VI-LABEL: test_smul24_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dword s5, s[0:1], 0x70
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dword s3, s[0:1], 0x70
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: s_bfe_i32 s5, s5, 0x180000
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s5, v0
-; VI-NEXT: v_mul_i32_i24_e32 v0, s5, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
+; VI-NEXT: s_bfe_i32 s1, s3, 0x180000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s1, v0
+; VI-NEXT: v_mul_i32_i24_e32 v0, s1, v0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64:
@@ -390,15 +390,15 @@ define amdgpu_kernel void @test_smul24_i64_square(ptr addrspace(1) %out, i32 %a,
;
; VI-LABEL: test_smul24_i64_square:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s4, s4
-; VI-NEXT: v_mul_i32_i24_e64 v0, s4, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s2, 0x180000
+; VI-NEXT: v_mul_hi_i32_i24_e64 v1, s0, s0
+; VI-NEXT: v_mul_i32_i24_e64 v0, s0, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i64_square:
@@ -485,21 +485,21 @@ define amdgpu_kernel void @test_smul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
; VI-LABEL: test_smul24_i33:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s3, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s3, s2, 8
-; VI-NEXT: s_lshl_b32 s5, s4, 8
-; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 40
+; VI-NEXT: s_lshl_b32 s1, s2, 8
+; VI-NEXT: s_lshl_b32 s3, s3, 8
; VI-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s2, v0
-; VI-NEXT: v_mul_i32_i24_e32 v0, s2, v0
+; VI-NEXT: s_ashr_i64 s[0:1], s[0:1], 40
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_i32_i24_e32 v1, s0, v0
+; VI-NEXT: v_mul_i32_i24_e32 v0, s0, v0
; VI-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: v_ashrrev_i64 v[0:1], 31, v[0:1]
-; VI-NEXT: s_mov_b32 s2, -1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smul24_i33:
@@ -594,16 +594,16 @@ define amdgpu_kernel void @test_smulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
;
; VI-LABEL: test_smulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_i32_i24_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_smulhi24_i33:
@@ -702,16 +702,16 @@ define amdgpu_kernel void @simplify_i24_crash(ptr addrspace(1) %out, i32 %arg0,
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB8_2: ; %bb11
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i32 s4, s4, 0x180000
-; VI-NEXT: s_bfe_i32 s5, s6, 0x180000
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i32 s0, s4, 0x180000
+; VI-NEXT: s_bfe_i32 s1, s6, 0x180000
+; VI-NEXT: s_mul_i32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: simplify_i24_crash:
diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index e6470a5833d3a..7c43c0b25301e 100644
--- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -78,16 +78,16 @@ define amdgpu_kernel void @test_umul24_i16_sext(ptr addrspace(1) %out, i16 %a, i
;
; VI-LABEL: test_umul24_i16_sext:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: s_sext_i32_i16 s4, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_mul_i32 s2, s2, s0
+; VI-NEXT: s_sext_i32_i16 s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_sext:
@@ -136,40 +136,40 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(ptr addrspace(1) %out, ptr
;
; VI-LABEL: test_umul24_i16_vgpr_sext:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr_sext:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -200,16 +200,16 @@ define amdgpu_kernel void @test_umul24_i16(ptr addrspace(1) %out, i16 %a, i16 %b
;
; VI-LABEL: test_umul24_i16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: s_mul_i32 s4, s4, s5
-; VI-NEXT: s_and_b32 s4, s4, 0xffff
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s2, 16
+; VI-NEXT: s_mul_i32 s2, s2, s0
+; VI-NEXT: s_and_b32 s0, s2, 0xffff
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16:
@@ -258,38 +258,38 @@ define amdgpu_kernel void @test_umul24_i16_vgpr(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: test_umul24_i16_vgpr:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v1
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v2, v[2:3]
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mul_lo_u16_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i16_vgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3]
-; GFX9-NEXT: global_load_ushort v3, v1, s[2:3]
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v3, v1, s[6:7]
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v2, v3
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -331,13 +331,13 @@ define amdgpu_kernel void @test_umul24_i8_vgpr(ptr addrspace(1) %out, ptr addrsp
; VI-LABEL: test_umul24_i8_vgpr:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v1
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc
; VI-NEXT: flat_load_ubyte v2, v[2:3]
; VI-NEXT: flat_load_ubyte v0, v[0:1]
@@ -596,14 +596,14 @@ define amdgpu_kernel void @test_umul24_i64_square(ptr addrspace(1) %out, [8 x i3
;
; VI-LABEL: test_umul24_i64_square:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s4, s4
-; VI-NEXT: v_mul_u32_u24_e64 v0, s4, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mul_hi_u32_u24_e64 v1, s2, s2
+; VI-NEXT: v_mul_u32_u24_e64 v0, s2, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i64_square:
@@ -703,17 +703,17 @@ define amdgpu_kernel void @test_umul24_i33(ptr addrspace(1) %out, i33 %a, i33 %b
;
; VI-LABEL: test_umul24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: v_mul_u32_u24_e32 v0, s5, v1
-; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s5, v1
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: v_mul_u32_u24_e32 v0, s3, v1
+; VI-NEXT: v_mul_hi_u32_u24_e32 v1, s3, v1
; VI-NEXT: v_and_b32_e32 v1, 1, v1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umul24_i33:
@@ -761,16 +761,16 @@ define amdgpu_kernel void @test_umulhi24_i33(ptr addrspace(1) %out, i33 %a, i33
;
; VI-LABEL: test_umulhi24_i33:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dword s5, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dword s3, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s5, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mul_hi_u32_u24_e32 v0, s3, v0
; VI-NEXT: v_and_b32_e32 v0, 1, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: test_umulhi24_i33:
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
index 9ab3eccd986a5..28f6c13a302e1 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-flat.ll
@@ -2104,10 +2104,10 @@ define i8 @flat_inst_valu_offset_64bit_13bit_neg_high_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2115,10 +2115,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 1
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 1
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2128,9 +2128,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2138,9 +2138,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2154,10 +2154,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2165,10 +2165,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2178,9 +2178,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2188,9 +2188,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2204,10 +2204,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2215,10 +2215,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2228,9 +2228,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2238,9 +2238,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2254,10 +2254,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2267,10 +2267,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2280,11 +2280,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2292,9 +2292,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2302,10 +2302,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2315,10 +2315,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2334,10 +2334,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2347,10 +2347,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2360,11 +2360,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff800, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2372,9 +2372,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-2048 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2382,10 +2382,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2395,10 +2395,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_11bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2414,10 +2414,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2427,10 +2427,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2440,11 +2440,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2452,9 +2452,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2462,10 +2462,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2475,10 +2475,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2494,10 +2494,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2507,10 +2507,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2520,11 +2520,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2532,9 +2532,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2542,10 +2542,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2555,10 +2555,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2574,10 +2574,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_neg_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
; GFX9-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: flat_store_byte v[0:1], v0
@@ -2585,10 +2585,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2598,9 +2598,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX11-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_store_b8 v[0:1], v0
@@ -2608,9 +2608,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2624,10 +2624,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2637,10 +2637,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2650,11 +2650,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2662,9 +2662,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2672,10 +2672,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2685,10 +2685,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2704,10 +2704,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -2717,10 +2717,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2730,11 +2730,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x3000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2742,9 +2742,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:16383 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2752,10 +2752,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2765,10 +2765,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x3fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x3fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2784,10 +2784,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2797,10 +2797,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2810,11 +2810,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2822,9 +2822,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2832,10 +2832,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2845,10 +2845,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_11bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2864,10 +2864,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_11bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffe000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2877,10 +2877,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2890,11 +2890,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2902,9 +2902,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2912,10 +2912,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2925,10 +2925,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -2944,10 +2944,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_12bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffc000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -2957,10 +2957,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX10-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX10-NEXT: s_addc_u32 s1, s3, -1
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -2970,11 +2970,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -2982,9 +2982,9 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX12-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: flat_load_u8 v0, v[0:1] offset:-16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: flat_store_b8 v[0:1], v0
@@ -2992,10 +2992,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3005,10 +3005,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_2x_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3025,10 +3025,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_2x_neg_13bit_max(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:2047 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3037,10 +3037,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3050,11 +3050,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3062,11 +3062,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2047 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3074,10 +3074,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3087,10 +3087,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3100,10 +3100,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3120,10 +3120,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:2048 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3132,10 +3132,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3145,11 +3145,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3157,11 +3157,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:2048 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3169,10 +3169,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3182,10 +3182,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3195,10 +3195,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3215,10 +3215,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s0
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-SDAG-NEXT: v_add_co_u32_e64 v0, vcc, 0, s2
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3227,10 +3227,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3240,11 +3240,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3252,11 +3252,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3264,10 +3264,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3277,10 +3277,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3290,10 +3290,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3310,10 +3310,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3323,10 +3323,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3336,11 +3336,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3348,11 +3348,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3360,10 +3360,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3373,10 +3373,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3386,10 +3386,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3406,10 +3406,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] offset:4095 glc
@@ -3419,10 +3419,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3432,11 +3432,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3444,11 +3444,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8191 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3456,10 +3456,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3469,10 +3469,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3482,10 +3482,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3502,10 +3502,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split0(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 2, v1, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3515,10 +3515,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 2
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3528,11 +3528,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3540,11 +3540,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8192 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: flat_store_b8 v[0:1], v0
@@ -3552,10 +3552,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3565,10 +3565,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3578,10 +3578,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3598,11 +3598,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_split1(ptr %p) {
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x7ff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3612,10 +3612,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3625,10 +3625,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x7ff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3638,10 +3638,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386561 scope:SCOPE_SYS
@@ -3651,10 +3651,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3664,10 +3664,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3677,10 +3677,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3697,11 +3697,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x800, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3711,10 +3711,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3724,10 +3724,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3737,10 +3737,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8386560 scope:SCOPE_SYS
@@ -3750,10 +3750,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3763,10 +3763,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3776,10 +3776,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3796,11 +3796,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_11bit_neg_high_split1(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3810,10 +3810,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3823,10 +3823,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0xfff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3836,10 +3836,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384513 scope:SCOPE_SYS
@@ -3849,10 +3849,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3862,10 +3862,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3875,10 +3875,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3895,11 +3895,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3909,10 +3909,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -3922,10 +3922,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3935,10 +3935,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8384512 scope:SCOPE_SYS
@@ -3948,10 +3948,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -3961,10 +3961,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -3974,10 +3974,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -3994,11 +3994,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_12bit_neg_high_split1(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x1fff, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4008,10 +4008,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -4021,10 +4021,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1fff, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4034,10 +4034,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380417 scope:SCOPE_SYS
@@ -4047,10 +4047,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4060,10 +4060,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4073,10 +4073,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
@@ -4093,11 +4093,11 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split0(ptr
define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr %p) {
; GFX9-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_bfrev_b32_e32 v1, 1
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX9-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0x2000, v0
; GFX9-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
; GFX9-SDAG-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4107,10 +4107,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX10-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: flat_load_ubyte v0, v[0:1] glc dlc
@@ -4120,10 +4120,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX11-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX11-SDAG-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4133,10 +4133,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX12-SDAG-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0x80000000, v1, vcc_lo
; GFX12-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8380416 scope:SCOPE_SYS
@@ -4146,10 +4146,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX9-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-GISEL-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -4159,10 +4159,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX11-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: flat_load_u8 v0, v[0:1] glc dlc
@@ -4172,10 +4172,10 @@ define amdgpu_kernel void @flat_inst_salu_offset_64bit_13bit_neg_high_split1(ptr
;
; GFX12-GISEL-LABEL: flat_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_load_u8 v0, v[0:1] scope:SCOPE_SYS
diff --git a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
index 10381bc21ecc9..8dcca320f2462 100644
--- a/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/offset-split-global.ll
@@ -2176,30 +2176,30 @@ define i8 @global_inst_valu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:1 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:1 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2208,10 +2208,10 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
;
; GFX12-LABEL: global_inst_salu_offset_1:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:1 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:1 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2226,30 +2226,30 @@ define amdgpu_kernel void @global_inst_salu_offset_1(ptr addrspace(1) %p) {
define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2258,10 +2258,10 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:2047 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:2047 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2276,30 +2276,30 @@ define amdgpu_kernel void @global_inst_salu_offset_11bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2308,10 +2308,10 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2326,30 +2326,30 @@ define amdgpu_kernel void @global_inst_salu_offset_12bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_13bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2358,10 +2358,10 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
;
; GFX12-LABEL: global_inst_salu_offset_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2376,30 +2376,30 @@ define amdgpu_kernel void @global_inst_salu_offset_13bit_max(ptr addrspace(1) %p
define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2408,10 +2408,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-2048 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-2048 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2426,20 +2426,20 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_11bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2449,10 +2449,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX11-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2461,10 +2461,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2473,10 +2473,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2490,11 +2490,11 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_12bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2502,10 +2502,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2515,10 +2515,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2530,10 +2530,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX12-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2542,10 +2542,10 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2553,11 +2553,11 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2573,30 +2573,30 @@ define amdgpu_kernel void @global_inst_salu_offset_neg_13bit_max(ptr addrspace(1
define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2605,10 +2605,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2623,30 +2623,30 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_11bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x1800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x1000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2655,10 +2655,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:8191 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:8191 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2673,30 +2673,30 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_12bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:4095 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0x3800
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] offset:2047 glc dlc
+; GFX10-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0x3000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:4095 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:4095 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2705,10 +2705,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
;
; GFX12-LABEL: global_inst_salu_offset_2x_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:16383 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:16383 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2723,20 +2723,20 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_13bit_max(ptr addrspace(1)
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] offset:-4096 glc
+; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
; GFX9-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfffff000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfffff000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2746,10 +2746,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX11-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 glc dlc
+; GFX11-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
@@ -2758,10 +2758,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-4096 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-4096 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2770,10 +2770,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_11bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xfffff000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2787,11 +2787,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_11bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2799,10 +2799,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2812,10 +2812,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffe000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffe000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2827,10 +2827,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-8192 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-8192 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2839,10 +2839,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2850,11 +2850,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_12bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffe000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2870,11 +2870,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_12bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX9-NEXT: s_addc_u32 s1, s1, -1
+; GFX9-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX9-NEXT: s_addc_u32 s1, s3, -1
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2882,10 +2882,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2895,10 +2895,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xffffc000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, -1
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xffffc000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, -1
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2910,10 +2910,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX12-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u8 v0, v0, s[0:1] offset:-16384 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u8 v0, v0, s[2:3] offset:-16384 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v[0:1], v0, off
; GFX12-NEXT: s_nop 0
@@ -2922,10 +2922,10 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, -1, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -2933,11 +2933,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_2x_neg_13bit_max:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0xffffc000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -2954,11 +2954,11 @@ define amdgpu_kernel void @global_inst_salu_offset_2x_neg_13bit_max(ptr addrspac
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -2966,10 +2966,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -2979,10 +2979,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -2994,10 +2994,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3009,10 +3009,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3020,11 +3020,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3034,11 +3034,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2047 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3055,11 +3055,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3067,10 +3067,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3080,10 +3080,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3095,10 +3095,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3110,10 +3110,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3121,11 +3121,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3135,11 +3135,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:2048 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3156,11 +3156,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3168,10 +3168,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3181,10 +3181,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3196,10 +3196,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3211,10 +3211,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3222,11 +3222,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3236,11 +3236,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3257,11 +3257,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3269,10 +3269,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3282,10 +3282,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3297,10 +3297,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3312,10 +3312,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3323,11 +3323,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3337,11 +3337,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4096 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3358,11 +3358,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3370,10 +3370,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3383,10 +3383,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3398,10 +3398,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3413,10 +3413,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1800, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3424,11 +3424,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x1000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:4095 glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3438,11 +3438,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8191 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3459,11 +3459,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split0(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-NEXT: s_addc_u32 s1, s1, 2
+; GFX9-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-NEXT: s_addc_u32 s1, s3, 2
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3471,10 +3471,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX10-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX10-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
@@ -3484,10 +3484,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX11-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-GISEL-NEXT: s_addc_u32 s1, s1, 2
+; GFX11-GISEL-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-GISEL-NEXT: s_addc_u32 s1, s3, 2
; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off glc dlc
@@ -3499,10 +3499,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 2
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 2
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: global_load_u8 v0, v[0:1], off scope:SCOPE_SYS
@@ -3514,10 +3514,10 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX10-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s1, s0
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, s0, 2, s3, s0
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off glc dlc
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: global_store_byte v[0:1], v0, off
@@ -3525,11 +3525,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX11-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s0
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, s0, 0x2000, s2
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off glc dlc
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3539,11 +3539,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s0
+; GFX12-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2
; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s1, s0
+; GFX12-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 2, s3, s0
; GFX12-SDAG-NEXT: global_load_u8 v0, v[0:1], off offset:8192 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3560,11 +3560,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_split1(ptr addrsp
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3572,11 +3572,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3584,11 +3584,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x7ff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x7ff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3598,11 +3598,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x7ff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x7ff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3612,12 +3612,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x7ff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x7ff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3634,11 +3634,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x800
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x800
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3646,11 +3646,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x800
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x800
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3658,11 +3658,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x800
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x800
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3672,11 +3672,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x800
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x800
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3686,12 +3686,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_11bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x800
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x800
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3708,11 +3708,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_11bit_neg_high_split1(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3720,11 +3720,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3732,11 +3732,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0xfff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0xfff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3746,11 +3746,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0xfff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3760,12 +3760,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0xfff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0xfff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3782,11 +3782,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3794,11 +3794,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3806,11 +3806,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x1000
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x1000
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3820,11 +3820,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3834,12 +3834,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_12bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1000
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1000
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3856,11 +3856,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_12bit_neg_high_split1(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3868,11 +3868,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3880,11 +3880,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x1fff
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x1fff
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3894,11 +3894,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x1fff
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1fff
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3908,12 +3908,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split0:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x1fff
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x1fff
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
@@ -3930,11 +3930,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split0(p
define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(ptr addrspace(1) %p) {
; GFX9-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX9-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX9-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX9-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX9-NEXT: global_load_ubyte v0, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -3942,11 +3942,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX10-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX10-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX10-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX10-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX10-NEXT: global_load_ubyte v0, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -3954,11 +3954,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX11-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s0, s0, 0x2000
-; GFX11-NEXT: s_addc_u32 s1, s1, 0x80000000
+; GFX11-NEXT: s_add_u32 s0, s2, 0x2000
+; GFX11-NEXT: s_addc_u32 s1, s3, 0x80000000
; GFX11-NEXT: global_load_u8 v0, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b8 v[0:1], v0, off
@@ -3968,11 +3968,11 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX12-GISEL-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 0x2000
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0x80000000
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s2, 0x2000
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0x80000000
; GFX12-GISEL-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v0, off
@@ -3982,12 +3982,12 @@ define amdgpu_kernel void @global_inst_salu_offset_64bit_13bit_neg_high_split1(p
;
; GFX12-SDAG-LABEL: global_inst_salu_offset_64bit_13bit_neg_high_split1:
; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-NEXT: s_movk_i32 s2, 0x2000
-; GFX12-SDAG-NEXT: s_brev_b32 s3, 1
+; GFX12-SDAG-NEXT: s_movk_i32 s0, 0x2000
+; GFX12-SDAG-NEXT: s_brev_b32 s1, 1
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1]
; GFX12-SDAG-NEXT: global_load_u8 v0, v0, s[0:1] scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v0, off
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index 769d035858ca8..48259163c431e 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -25,15 +25,15 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -43,30 +43,30 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(ptr addrspac
;
; GFX11-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f32_enable_ieee_signed_zeros:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -101,15 +101,15 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -119,30 +119,30 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_signed_zeros(ptr addrspac
;
; GFX11-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f64_enable_ieee_signed_zeros:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -177,15 +177,15 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
;
; VI-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f32_e32 v2, 1.0, v3
@@ -195,30 +195,30 @@ define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(ptr addrspace(1) %out
;
; GFX11-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f32_enable_ieee_nsz:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -253,15 +253,15 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
;
; VI-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
@@ -271,30 +271,30 @@ define amdgpu_kernel void @v_omod_div2_f64_enable_ieee_nsz(ptr addrspace(1) %out
;
; GFX11-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 1.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: v_omod_div2_f64_enable_ieee_nsz:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_add_f64_e32 v[0:1], 1.0, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mul_f64_e32 v[0:1], 0.5, v[0:1]
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
index bd7f9014d55ca..5b755d0a515c1 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.ll
@@ -62,15 +62,15 @@ define amdgpu_kernel void @if_masked_0x80000000(i32 %arg, ptr addrspace(1) %p)
define amdgpu_kernel void @if_masked_0x8000000000000000(i64 %arg, ptr addrspace(1) %p) {
; GCN-LABEL: if_masked_0x8000000000000000:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: s_mov_b32 s0, 0
; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_and_b32 s1, s5, 0x80000000
; GCN-NEXT: s_cmp_eq_u64 s[0:1], 0
; GCN-NEXT: s_cselect_b32 s0, 22, 33
; GCN-NEXT: v_mov_b32_e32 v1, s0
-; GCN-NEXT: global_store_dword v0, v1, s[2:3]
+; GCN-NEXT: global_store_dword v0, v1, s[6:7]
; GCN-NEXT: s_endpgm
%and = and i64 %arg, 9223372036854775808
%cmp = icmp eq i64 %and, 0
diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll
index 65f4a1b17a881..63e9e60ead900 100644
--- a/llvm/test/CodeGen/AMDGPU/or.ll
+++ b/llvm/test/CodeGen/AMDGPU/or.ll
@@ -25,21 +25,21 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; GFX8-LABEL: or_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-NEXT: v_or_b32_e32 v0, v0, v2
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: or_v2i32:
@@ -92,24 +92,24 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; GFX8-LABEL: or_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX8-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v3, v3, v7
; GFX8-NEXT: v_or_b32_e32 v2, v2, v6
; GFX8-NEXT: v_or_b32_e32 v1, v1, v5
; GFX8-NEXT: v_or_b32_e32 v0, v0, v4
-; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: or_v4i32:
@@ -258,14 +258,14 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a)
;
; GFX8-LABEL: scalar_or_literal_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, 0x1869f
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, 0x1869f
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_literal_i32:
@@ -300,16 +300,16 @@ define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32
;
; GFX8-LABEL: scalar_or_literal_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s5, s5, 0xf237b
-; GFX8-NEXT: s_or_b32 s4, s4, 0x3039
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s3, 0xf237b
+; GFX8-NEXT: s_or_b32 s1, s2, 0x3039
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_literal_i64:
@@ -357,18 +357,18 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x74
-; GFX8-NEXT: s_movk_i32 s8, 0x3039
-; GFX8-NEXT: s_mov_b32 s9, 0xf237b
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x74
+; GFX8-NEXT: s_movk_i32 s0, 0x3039
+; GFX8-NEXT: s_mov_b32 s1, 0xf237b
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s0, s0, 0x3039
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_add_u32 s0, s8, 0x3039
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
-; GFX8-NEXT: s_addc_u32 s1, s1, 0xf237b
+; GFX8-NEXT: s_addc_u32 s1, s9, 0xf237b
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -421,15 +421,15 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x
;
; GFX8-LABEL: scalar_or_inline_imm_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, 63
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s5
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, 63
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_inline_imm_i64:
@@ -534,15 +534,15 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [
;
; GFX8-LABEL: scalar_or_neg_inline_imm_i64:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s4, -8
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s2, -8
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: scalar_or_neg_inline_imm_i64:
@@ -583,20 +583,20 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: vector_or_literal_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_literal_i32:
@@ -642,20 +642,20 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out,
;
; GFX8-LABEL: vector_or_inline_immediate_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 4, v0
-; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_inline_immediate_i32:
@@ -886,21 +886,21 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr
;
; GFX8-LABEL: vector_or_i64_loadimm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v1, 0x146f, v1
; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_loadimm:
@@ -949,20 +949,20 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac
;
; GFX8-LABEL: vector_or_i64_imm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 8, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_imm:
@@ -1009,21 +1009,21 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p
;
; GFX8-LABEL: vector_or_i64_neg_inline_imm:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, -8, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_neg_inline_imm:
@@ -1072,21 +1072,21 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr
;
; GFX8-LABEL: vector_or_i64_neg_literal:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s7, 0xf000
-; GFX8-NEXT: s_mov_b32 s6, -1
-; GFX8-NEXT: s_mov_b32 s10, s6
-; GFX8-NEXT: s_mov_b32 s11, s7
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s3, 0xf000
+; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_mov_b32 s10, s2
+; GFX8-NEXT: s_mov_b32 s11, s3
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s8, s2
-; GFX8-NEXT: s_mov_b32 s9, s3
+; GFX8-NEXT: s_mov_b32 s8, s6
+; GFX8-NEXT: s_mov_b32 s9, s7
; GFX8-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX8-NEXT: s_mov_b32 s4, s0
-; GFX8-NEXT: s_mov_b32 s5, s1
+; GFX8-NEXT: s_mov_b32 s0, s4
+; GFX8-NEXT: s_mov_b32 s1, s5
; GFX8-NEXT: v_mov_b32_e32 v1, -1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0
-; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: vector_or_i64_neg_literal:
@@ -1129,15 +1129,15 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32],
;
; GFX8-LABEL: trunc_i64_or_to_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x4c
-; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dword s2, s[0:1], 0x4c
+; GFX8-NEXT: s_load_dword s3, s[0:1], 0x74
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s7, 0xf000
+; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_or_b32 s4, s5, s4
-; GFX8-NEXT: v_mov_b32_e32 v0, s4
-; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX8-NEXT: s_or_b32 s0, s3, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: trunc_i64_or_to_i32:
@@ -1261,17 +1261,17 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c
; GFX8-LABEL: s_or_i1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX8-NEXT: s_mov_b32 s3, 0xf000
-; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; GFX8-NEXT: s_mov_b32 s11, 0xf000
+; GFX8-NEXT: s_mov_b32 s10, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_cmp_eq_u32 s4, s5
-; GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX8-NEXT: s_cmp_eq_u32 s6, s7
-; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX8-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
-; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX8-NEXT: buffer_store_byte v0, off, s[8:11], 0
; GFX8-NEXT: s_endpgm
;
; EG-LABEL: s_or_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
index 1899a0abc6592..40489942295cf 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-op-sel.ll
@@ -299,15 +299,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_hi(ptr addrspace(1) %out
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -330,15 +330,15 @@ define amdgpu_kernel void @fma_vector_vector_vector_neg_hi(ptr addrspace(1) %out
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -362,14 +362,14 @@ define amdgpu_kernel void @add_vector_scalar_hi(ptr addrspace(1) %out, ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v0, v0 offset:4
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_add_u16 v0, v1, v0 op_sel:[0,1]
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x i16>, ptr addrspace(3) %lds, i32 1
@@ -389,15 +389,15 @@ define amdgpu_kernel void @fma_vector_vector_scalar_hi(ptr addrspace(1) %out, pt
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -420,15 +420,15 @@ define amdgpu_kernel void @fma_vector_vector_neg_vector_lo_neg_hi(ptr addrspace(
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -453,15 +453,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_vector(ptr addrspace(1) %out,
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -483,15 +483,15 @@ define amdgpu_kernel void @fma_vector_vector_swap_neg_vector(ptr addrspace(1) %o
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1] neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -514,15 +514,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_0(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0] neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -544,15 +544,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_1(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -574,15 +574,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_2(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 neg_hi:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -604,15 +604,15 @@ define amdgpu_kernel void @fma_vector_vector_blend_vector_neg_vector_3(ptr addrs
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] neg_lo:[0,0,1]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -634,16 +634,16 @@ define amdgpu_kernel void @bitcast_fneg_f32(ptr addrspace(1) %out, ptr addrspace
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: v_pk_add_f16 v0, v0, v1
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -661,16 +661,16 @@ define amdgpu_kernel void @shuffle_bitcast_fneg_f32(ptr addrspace(1) %out, ptr a
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v2, 0
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v0, v0
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: v_pk_add_f16 v0, v0, v1 op_sel:[0,1] op_sel_hi:[1,0]
-; GCN-NEXT: global_store_dword v2, v0, s[0:1]
+; GCN-NEXT: global_store_dword v2, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x half>, ptr addrspace(3) %lds, align 4
@@ -689,18 +689,18 @@ define amdgpu_kernel void @extract_from_i64(ptr addrspace(1) %out, ptr addrspace
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_dword s2, s[0:1], 0x2c
; GCN-NEXT: v_mov_b32_e32 v3, 0xffff
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: ds_read_b32 v2, v0
; GCN-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_and_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GCN-NEXT: v_lshl_or_b32 v0, v0, 16, v3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_pk_add_u16 v0, v2, v0
-; GCN-NEXT: global_store_dword v1, v0, s[0:1]
+; GCN-NEXT: global_store_dword v1, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%vec0 = load volatile <2 x i16>, ptr addrspace(3) %lds, align 4
@@ -726,14 +726,14 @@ define amdgpu_kernel void @bitcast_lo_elt_op_sel(ptr addrspace(1) %out, ptr addr
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_pk_add_f16 v0, v0, 2.0 op_sel_hi:[1,0]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
@@ -764,7 +764,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: ds_read_b32 v1, v0
; GCN-NEXT: ds_read_b32 v2, v0 offset:4
; GCN-NEXT: ds_read_b32 v0, v0 offset:8
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: ; kill: killed $vgpr0_vgpr1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_load_ushort v3, v[0:1], off glc
@@ -776,7 +776,7 @@ define amdgpu_kernel void @mix_elt_types_op_sel(ptr addrspace(1) %out, ptr addrs
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: v_pk_fma_f16 v0, v1, v2, v0 op_sel:[0,0,1] op_sel_hi:[1,1,0]
-; GCN-NEXT: global_store_dword v3, v0, s[0:1]
+; GCN-NEXT: global_store_dword v3, v0, s[2:3]
; GCN-NEXT: s_endpgm
bb:
%lds.gep1 = getelementptr inbounds <2 x half>, ptr addrspace(3) %lds, i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
index 4794c29621525..8333386a80722 100644
--- a/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/post-ra-soft-clause-dbg-info.ll
@@ -8,16 +8,16 @@
define amdgpu_kernel void @dbg_clause(ptr addrspace(1) %out, ptr addrspace(1) %aptr) !dbg !4 {
; GCN-LABEL: dbg_clause:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_dword v1, v0, s[2:3]
+; GCN-NEXT: global_load_dword v1, v0, s[6:7]
; GCN-NEXT: ;DEBUG_VALUE: foo:a <- $vgpr1
-; GCN-NEXT: global_load_dword v2, v0, s[2:3] offset:32
+; GCN-NEXT: global_load_dword v2, v0, s[6:7] offset:32
; GCN-NEXT: ;DEBUG_VALUE: foo:b <- $vgpr2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v1, v1, v2
-; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: global_store_dword v0, v1, s[4:5]
; GCN-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 3f8b64b618e9d..0747760806dc6 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -519,11 +519,11 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a
; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
-; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, s5
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-2-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-2-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
@@ -534,11 +534,11 @@ define amdgpu_kernel void @byref_kernel_preload_arg(ptr addrspace(1) %out, ptr a
; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
-; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
+; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x100
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s0
-; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s1
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, s5
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-PRELOAD-8-NEXT: s_waitcnt vmcnt(0)
; GFX940-PRELOAD-8-NEXT: global_store_dword v0, v2, s[2:3] sc0 sc1
diff --git a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
index dabb9d43bf3d6..0c8dbd1db8ec6 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-vect3-load.ll
@@ -21,7 +21,7 @@ define protected amdgpu_kernel void @load_v3i32_align8(ptr addrspace(1) %arg) #0
; GCN-LABEL: load_v3i32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
%vec = load <3 x i32>, ptr addrspace(1) %arg, align 8
store <3 x i32> %vec, ptr addrspace(1) undef, align 8
ret void
@@ -52,7 +52,7 @@ define protected amdgpu_kernel void @load_v3f32_align8(ptr addrspace(1) %arg) #0
; GCN-LABEL: load_v3f32_align8:
; GCN: ; %bb.0:
; GCN: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x0
+; GCN-NEXT: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[4:5], 0x0
%vec = load <3 x float>, ptr addrspace(1) %arg, align 8
store <3 x float> %vec, ptr addrspace(1) undef, align 8
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
index 2ce0b9eed02cb..a82f301c56997 100644
--- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll
@@ -110,46 +110,46 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr
define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspace(8) %b) {
; SDAG-LABEL: buffers_might_alias:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
-; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; SDAG-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
; SDAG-NEXT: s_waitcnt vmcnt(0)
; SDAG-NEXT: v_mul_f32_e32 v0, v0, v0
-; SDAG-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12
+; SDAG-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: buffers_might_alias:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:4
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:8
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
-; GISEL-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:12
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8
+; GISEL-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:12
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_mul_f32_e32 v0, v0, v0
-; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:12
+; GISEL-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:12
; GISEL-NEXT: s_endpgm
%l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0)
%s0 = fmul float %l0, %l0
@@ -173,28 +173,28 @@ define amdgpu_kernel void @buffers_might_alias(ptr addrspace(8) %a, ptr addrspac
define amdgpu_kernel void @independent_offsets(ptr addrspace(8) %a) {
; SDAG-LABEL: independent_offsets:
; SDAG: ; %bb.0:
-; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SDAG-NEXT: v_mov_b32_e32 v2, 1.0
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; SDAG-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
; SDAG-NEXT: s_nop 0
-; SDAG-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; SDAG-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
; SDAG-NEXT: s_waitcnt vmcnt(1)
-; SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; SDAG-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: independent_offsets:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GISEL-NEXT: v_mov_b32_e32 v2, 1.0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen offset:4
+; GISEL-NEXT: buffer_load_dword v1, v0, s[4:7], 0 offen offset:4
; GISEL-NEXT: s_nop 0
-; GISEL-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GISEL-NEXT: buffer_store_dword v2, v0, s[4:7], 0 offen
; GISEL-NEXT: s_waitcnt vmcnt(1)
-; GISEL-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:8
+; GISEL-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen offset:8
; GISEL-NEXT: s_endpgm
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%idx = shl i32 %lane, 2
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 74bad5ea3edce..5be6082cbd19d 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -759,12 +759,12 @@ define amdgpu_kernel void @s_rcp_pat_f32_daz(ptr addrspace(1) %out, float %src)
;
; VI-LABEL: s_rcp_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -811,12 +811,12 @@ define amdgpu_kernel void @s_rcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, float
;
; VI-LABEL: s_rcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -863,12 +863,12 @@ define amdgpu_kernel void @s_rcp_fast_ulp25_pat_f32_daz(ptr addrspace(1) %out, f
;
; VI-LABEL: s_rcp_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -915,12 +915,12 @@ define amdgpu_kernel void @s_rcp_arcp_ulp25_pat_f32_daz(ptr addrspace(1) %out, f
;
; VI-LABEL: s_rcp_arcp_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -967,12 +967,12 @@ define amdgpu_kernel void @s_rcp_global_fast_ulp25_pat_f32_daz(ptr addrspace(1)
;
; VI-LABEL: s_rcp_global_fast_ulp25_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1019,12 +1019,12 @@ define amdgpu_kernel void @s_rcp_fabs_pat_f32_daz(ptr addrspace(1) %out, float %
;
; VI-LABEL: s_rcp_fabs_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, |s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, |s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1072,12 +1072,12 @@ define amdgpu_kernel void @s_neg_rcp_pat_f32_daz(ptr addrspace(1) %out, float %s
;
; VI-LABEL: s_neg_rcp_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, -s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1127,12 +1127,12 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_f32_daz(ptr addrspace(1) %out, fl
;
; VI-LABEL: s_rcp_fabs_fneg_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1188,13 +1188,13 @@ define amdgpu_kernel void @s_rcp_fabs_fneg_pat_multi_use_f32_daz(ptr addrspace(1
;
; VI-LABEL: s_rcp_fabs_fneg_pat_multi_use_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_rcp_f32_e64 v2, -|s2|
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mul_f32_e64 v3, s2, -|s2|
+; VI-NEXT: v_rcp_f32_e64 v2, -|s4|
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mul_f32_e64 v3, s4, -|s4|
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dword v[0:1], v3
@@ -1254,12 +1254,12 @@ define amdgpu_kernel void @s_div_arcp_2_x_pat_f32_daz(ptr addrspace(1) %out) #0
;
; VI-LABEL: s_div_arcp_2_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e64 v2, s2, 0.5
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e64 v2, s4, 0.5
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1309,13 +1309,13 @@ define amdgpu_kernel void @s_div_arcp_k_x_pat_f32_daz(ptr addrspace(1) %out) #0
;
; VI-LABEL: s_div_arcp_k_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x3dcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1366,13 +1366,13 @@ define amdgpu_kernel void @s_div_arcp_neg_k_x_pat_f32_daz(ptr addrspace(1) %out)
;
; VI-LABEL: s_div_arcp_neg_k_x_pat_f32_daz:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x0
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x0
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0xbdcccccd
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mul_f32_e32 v2, s2, v0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mul_f32_e32 v2, s4, v0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll
index 4a004731c42f9..9494b3c4675ec 100644
--- a/llvm/test/CodeGen/AMDGPU/rotl.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotl.ll
@@ -112,16 +112,16 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-LABEL: rotl_v2i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: s_sub_i32 s3, 32, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sub_i32 s0, 32, s6
+; GFX8-NEXT: s_sub_i32 s1, 32, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -143,14 +143,14 @@ define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, 32, s7
-; GFX11-NEXT: s_sub_i32 s3, 32, s6
-; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s2
-; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s3
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: s_sub_i32 s0, 32, s7
+; GFX11-NEXT: s_sub_i32 s1, 32, s6
+; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s0
+; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s1
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -211,22 +211,22 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-LABEL: rotl_v4i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s3, 32, s9
+; GFX8-NEXT: s_sub_i32 s1, 32, s9
; GFX8-NEXT: s_sub_i32 s9, 32, s11
-; GFX8-NEXT: s_sub_i32 s2, 32, s8
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
; GFX8-NEXT: s_sub_i32 s8, 32, s10
; GFX8-NEXT: v_mov_b32_e32 v0, s9
; GFX8-NEXT: v_alignbit_b32 v3, s7, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -252,18 +252,18 @@ define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_sub_i32 s8, 32, s11
; GFX11-NEXT: s_sub_i32 s9, 32, s10
; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s8
; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s9
-; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s3
-; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s2
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s1
+; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s0
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll
index d6431d731a1f7..f9da328d4851b 100644
--- a/llvm/test/CodeGen/AMDGPU/rotr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rotr.ll
@@ -99,14 +99,14 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX8-LABEL: rotr_v2i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s7
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v0
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -126,12 +126,12 @@ define amdgpu_kernel void @rotr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s7
; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s6
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,7 +180,7 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-LABEL: rotr_v4i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s11
; GFX8-NEXT: v_mov_b32_e32 v1, s10
@@ -189,9 +189,9 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX8-NEXT: v_alignbit_b32 v2, s6, s6, v1
; GFX8-NEXT: v_alignbit_b32 v1, s5, s5, v4
; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
; GFX8-NEXT: v_alignbit_b32 v0, s4, s4, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
;
@@ -213,14 +213,14 @@ define amdgpu_kernel void @rotr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v4, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_alignbit_b32 v3, s7, s7, s11
; GFX11-NEXT: v_alignbit_b32 v2, s6, s6, s10
; GFX11-NEXT: v_alignbit_b32 v1, s5, s5, s9
; GFX11-NEXT: v_alignbit_b32 v0, s4, s4, s8
-; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v4, v[0:3], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll
index bd3c422b52efc..acacf769ac5df 100644
--- a/llvm/test/CodeGen/AMDGPU/saddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/saddo.ll
@@ -39,18 +39,18 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-LABEL: saddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s6
-; VI-NEXT: s_add_u32 s2, s6, s0
+; VI-NEXT: s_add_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v2, s7
-; VI-NEXT: s_addc_u32 s3, s7, s1
-; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0
-; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2]
-; VI-NEXT: v_mov_b32_e32 v3, s3
-; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc
-; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
+; VI-NEXT: s_addc_u32 s1, s7, s3
+; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0
+; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[1:2]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_xor_b64 s[2:3], s[8:9], vcc
+; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -99,19 +99,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s2, s6, s0
-; GFX11-NEXT: s_addc_u32 s3, s7, s1
-; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7]
+; GFX11-NEXT: s_add_u32 s0, s6, s2
+; GFX11-NEXT: s_addc_u32 s1, s7, s3
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s0, s0, s1
-; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: s_xor_b32 s2, s2, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -155,17 +155,17 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_add_i32 s4, s0, s1
-; VI-NEXT: s_cmp_lt_i32 s1, 0
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
-; VI-NEXT: s_cmp_lt_i32 s4, s0
+; VI-NEXT: s_add_i32 s4, s2, s3
+; VI-NEXT: s_cmp_lt_i32 s3, 0
; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
+; VI-NEXT: s_cmp_lt_i32 s4, s2
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dword v[0:1], v4
@@ -208,18 +208,18 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-LABEL: s_saddo_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_i32 v0, s4, s5 clamp
-; GFX11-NEXT: s_add_i32 s4, s4, s5
+; GFX11-NEXT: v_add_nc_i32 v0, s2, s3 clamp
+; GFX11-NEXT: s_add_i32 s0, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s4
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s4, v0
+; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, s0, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v1, v2, s[0:1]
-; GFX11-NEXT: global_store_b8 v1, v0, s[2:3]
+; GFX11-NEXT: global_store_b32 v1, v2, s[4:5]
+; GFX11-NEXT: global_store_b8 v1, v0, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -264,18 +264,18 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5
@@ -288,45 +288,45 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_saddo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_i32 v3, v1, v2 clamp
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v1, v3
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX10-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX10-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX10-NEXT: global_load_dword v2, v0, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX10-NEXT: global_store_byte v0, v2, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v1, v0, s[4:5]
-; GFX11-NEXT: global_load_b32 v2, v0, s[6:7]
+; GFX11-NEXT: global_load_b32 v1, v0, s[8:9]
+; GFX11-NEXT: global_load_b32 v2, v0, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_i32 v3, v1, v2 clamp
; GFX11-NEXT: v_add_nc_u32_e32 v1, v1, v2
@@ -334,8 +334,8 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v3
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT: global_store_b8 v0, v2, s[2:3]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX11-NEXT: global_store_b8 v0, v2, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -379,21 +379,21 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_saddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
-; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
+; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -401,56 +401,56 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_saddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s8, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s9, s5, s7
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[10:11], s[6:7], 0
-; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
+; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX9-NEXT: s_xor_b64 s[0:1], s[10:11], vcc
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[10:11], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; GFX9-NEXT: global_store_byte v2, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v2, v0, s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: s_saddo_i64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s8, s4, s6
-; GFX10-NEXT: s_addc_u32 s9, s5, s7
-; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
-; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_xor_b32 s4, s6, s4
-; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
-; GFX10-NEXT: global_store_byte v2, v3, s[2:3]
+; GFX10-NEXT: s_add_u32 s0, s8, s10
+; GFX10-NEXT: s_addc_u32 s1, s9, s11
+; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX10-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-NEXT: s_xor_b32 s0, s2, s3
+; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT: global_store_byte v2, v3, s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_saddo_i64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_add_u32 s8, s4, s6
-; GFX11-NEXT: s_addc_u32 s9, s5, s7
-; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0
-; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5]
-; GFX11-NEXT: v_mov_b32_e32 v0, s8
-; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9
+; GFX11-NEXT: s_add_u32 s0, s8, s10
+; GFX11-NEXT: s_addc_u32 s1, s9, s11
+; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0
+; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[8:9]
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_xor_b32 s4, s6, s4
-; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX11-NEXT: s_xor_b32 s0, s2, s3
+; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
-; GFX11-NEXT: global_store_b8 v2, v3, s[2:3]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: global_store_b8 v2, v3, s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -496,18 +496,18 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
@@ -627,18 +627,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_saddo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
@@ -656,11 +656,11 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v5, v1, v3
; GFX9-NEXT: v_add_i32 v1, v1, v3 clamp
@@ -670,18 +670,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_saddo_v2i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v5, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[4:5]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v5, s[8:9]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v5, s[10:11]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX10-NEXT: v_add_nc_i32 v1, v1, v3 clamp
@@ -691,18 +691,18 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[0:1]
-; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v5, v[3:4], s[4:5]
+; GFX10-NEXT: global_store_dwordx2 v5, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_saddo_v2i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v5, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v5, s[4:5]
-; GFX11-NEXT: global_load_b64 v[2:3], v5, s[6:7]
+; GFX11-NEXT: global_load_b64 v[0:1], v5, s[8:9]
+; GFX11-NEXT: global_load_b64 v[2:3], v5, s[10:11]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u32_e32 v4, v1, v3
; GFX11-NEXT: v_add_nc_i32 v1, v1, v3 clamp
@@ -714,8 +714,8 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v0
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_store_b64 v5, v[3:4], s[0:1]
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[2:3]
+; GFX11-NEXT: global_store_b64 v5, v[3:4], s[4:5]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[6:7]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 5260a4847f70d..ae1b1915a46af 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -26,22 +26,22 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: scalar_to_vector_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tmp1 = load i32, ptr addrspace(1) %in, align 4
%bc = bitcast i32 %tmp1 to <2 x i16>
@@ -73,22 +73,22 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: scalar_to_vector_v2f32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; VI-NEXT: v_mov_b32_e32 v1, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%tmp1 = load float, ptr addrspace(1) %in, align 4
%bc = bitcast float %tmp1 to <2 x i16>
@@ -230,13 +230,13 @@ define amdgpu_kernel void @scalar_to_vector_test6(ptr addrspace(1) %out, i8 zero
;
; VI-LABEL: scalar_to_vector_test6:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
%bc = bitcast <4 x i8> %newvec0 to <2 x half>
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll
index 6372d74161fad..ef8e194caceee 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll
@@ -220,44 +220,44 @@ define amdgpu_kernel void @sdiv_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_i32_4:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 30, v1
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_i32_4:
@@ -316,48 +316,48 @@ define amdgpu_kernel void @slow_sdiv_i32_3435(ptr addrspace(1) %out, ptr addrspa
;
; TONGA-LABEL: slow_sdiv_i32_3435:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s2, 0x98a1930b
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, 0x98a1930b
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_mul_hi_i32 v1, v0, s2
+; TONGA-NEXT: v_mul_hi_i32 v1, v0, s0
+; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: slow_sdiv_i32_3435:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s2, 0x98a1930b
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, 0x98a1930b
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v1, v0, s2
+; GFX9-NEXT: v_mul_hi_i32 v1, v0, s0
+; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: v_add_u32_e32 v0, v1, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 11, v0
; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: slow_sdiv_i32_3435:
@@ -462,17 +462,17 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_v2i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; TONGA-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s7, 0xf000
; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_mov_b32 s2, s6
+; TONGA-NEXT: s_mov_b32 s3, s7
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s10
+; TONGA-NEXT: s_mov_b32 s1, s11
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: s_mov_b32 s4, s8
+; TONGA-NEXT: s_mov_b32 s5, s9
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_sub_u32_e32 v6, vcc, 0, v2
; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v3
@@ -707,17 +707,17 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
;
; TONGA-LABEL: sdiv_v2i32_4:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@@ -727,22 +727,22 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v3, v1
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 2, v1
-; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: sdiv_v2i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@@ -752,7 +752,7 @@ define amdgpu_kernel void @sdiv_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_add_u32_e32 v1, v1, v3
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 2, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
-; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v2i32_4:
@@ -918,18 +918,18 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; TONGA-LABEL: sdiv_v4i32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s11, 0xf000
; TONGA-NEXT: s_mov_b32 s10, -1
-; TONGA-NEXT: s_mov_b32 s6, s10
-; TONGA-NEXT: s_mov_b32 s7, s11
+; TONGA-NEXT: s_mov_b32 s2, s10
+; TONGA-NEXT: s_mov_b32 s3, s11
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; TONGA-NEXT: s_mov_b32 s8, s0
-; TONGA-NEXT: s_mov_b32 s9, s1
+; TONGA-NEXT: s_mov_b32 s0, s6
+; TONGA-NEXT: s_mov_b32 s1, s7
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; TONGA-NEXT: s_mov_b32 s8, s4
+; TONGA-NEXT: s_mov_b32 s9, s5
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1
; TONGA-NEXT: s_waitcnt vmcnt(0)
@@ -1371,17 +1371,17 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: sdiv_v4i32_4:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
@@ -1399,7 +1399,7 @@ define amdgpu_kernel void @sdiv_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 2, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 2, v2
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 2, v3
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: sdiv_v4i32_4:
@@ -1482,18 +1482,18 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; TONGA-LABEL: v_sdiv_i8:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
-; TONGA-NEXT: s_mov_b32 s10, s6
-; TONGA-NEXT: s_mov_b32 s11, s7
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
+; TONGA-NEXT: s_mov_b32 s10, s2
+; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s8, s2
-; TONGA-NEXT: s_mov_b32 s9, s3
+; TONGA-NEXT: s_mov_b32 s8, s6
+; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1
; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_waitcnt vmcnt(1)
; TONGA-NEXT: v_cvt_f32_i32_e32 v2, v0
; TONGA-NEXT: s_waitcnt vmcnt(0)
@@ -1510,23 +1510,23 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; TONGA-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; TONGA-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 8
-; TONGA-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dword v0, off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: v_sdiv_i8:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s10, s6
-; GFX9-NEXT: s_mov_b32 s11, s7
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s10, s2
+; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s8, s2
-; GFX9-NEXT: s_mov_b32 s9, s3
+; GFX9-NEXT: s_mov_b32 s8, s6
+; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_sbyte v0, off, s[8:11], 0 offset:1
; GFX9-NEXT: buffer_load_sbyte v1, off, s[8:11], 0
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -1543,7 +1543,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX9-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
; GFX9-NEXT: v_add_u32_e32 v0, v4, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: v_sdiv_i8:
@@ -2221,21 +2221,21 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
;
; TONGA-LABEL: scalarize_mulhs_4xi32:
; TONGA: ; %bb.0:
-; TONGA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; TONGA-NEXT: s_mov_b32 s7, 0xf000
-; TONGA-NEXT: s_mov_b32 s6, -1
+; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; TONGA-NEXT: s_mov_b32 s3, 0xf000
+; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
-; TONGA-NEXT: s_mov_b32 s4, s0
-; TONGA-NEXT: s_mov_b32 s5, s1
-; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
-; TONGA-NEXT: s_mov_b32 s4, s2
-; TONGA-NEXT: s_mov_b32 s5, s3
+; TONGA-NEXT: s_mov_b32 s0, s4
+; TONGA-NEXT: s_mov_b32 s1, s5
+; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; TONGA-NEXT: s_mov_b32 s4, 0x1389c755
+; TONGA-NEXT: s_mov_b32 s0, s6
+; TONGA-NEXT: s_mov_b32 s1, s7
; TONGA-NEXT: s_waitcnt vmcnt(0)
-; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
-; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
-; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
-; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
+; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4
+; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4
+; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4
+; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@@ -2248,26 +2248,26 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
-; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: scalarize_mulhs_4xi32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b32 s3, 0xf000
+; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s4, s0
-; GFX9-NEXT: s_mov_b32 s5, s1
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
-; GFX9-NEXT: s_mov_b32 s4, s2
-; GFX9-NEXT: s_mov_b32 s5, s3
+; GFX9-NEXT: s_mov_b32 s0, s4
+; GFX9-NEXT: s_mov_b32 s1, s5
+; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s4, 0x1389c755
+; GFX9-NEXT: s_mov_b32 s0, s6
+; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
-; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
-; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
-; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
+; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4
+; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4
+; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4
+; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@@ -2280,7 +2280,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(ptr addrspace(1) nocapture read
; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
-; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: scalarize_mulhs_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 0f2eedb1923d6..b271a036c6002 100644
--- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -7,13 +7,13 @@
define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: add_shr_i32:
; NOSDWA: ; %bb.0:
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, v3, v2
@@ -22,13 +22,13 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX89-LABEL: add_shr_i32:
; GFX89: ; %bb.0:
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_add_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-NEXT: flat_store_dword v[0:1], v2
@@ -36,24 +36,24 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: add_shr_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: add_shr_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_add_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in, align 4
%shr = lshr i32 %a, 16
@@ -65,13 +65,13 @@ define amdgpu_kernel void @add_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: sub_shr_i32:
; NOSDWA: ; %bb.0:
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_sub_u32_e32 v2, vcc, v3, v2
@@ -80,13 +80,13 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX89-LABEL: sub_shr_i32:
; GFX89: ; %bb.0:
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_sub_u32_sdwa v2, vcc, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX89-NEXT: flat_store_dword v[0:1], v2
@@ -94,24 +94,24 @@ define amdgpu_kernel void @sub_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: sub_shr_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sub_shr_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in, align 4
%shr = lshr i32 %a, 16
@@ -124,14 +124,14 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; NOSDWA-LABEL: mul_shr_i32:
; NOSDWA: ; %bb.0:
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
@@ -148,14 +148,14 @@ define amdgpu_kernel void @mul_shr_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; GFX89-LABEL: mul_shr_i32:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -211,14 +211,14 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; NOSDWA-LABEL: mul_i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
@@ -232,14 +232,14 @@ define amdgpu_kernel void @mul_i16(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; GFX89-LABEL: mul_i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
@@ -294,14 +294,14 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
@@ -320,14 +320,14 @@ define amdgpu_kernel void @mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -384,14 +384,14 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v4i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -416,14 +416,14 @@ define amdgpu_kernel void @mul_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v4i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -485,14 +485,14 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; NOSDWA-LABEL: mul_v8i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -529,14 +529,14 @@ define amdgpu_kernel void @mul_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in
; GFX89-LABEL: mul_v8i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -608,12 +608,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -626,12 +626,12 @@ define amdgpu_kernel void @mul_half(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -680,11 +680,11 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v2half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -704,12 +704,12 @@ define amdgpu_kernel void @mul_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v2half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -760,12 +760,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v4half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: v_mov_b32_e32 v4, s4
@@ -790,12 +790,12 @@ define amdgpu_kernel void @mul_v4half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v4half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v4, s4
@@ -851,12 +851,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mul_v8half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v4, s6
; NOSDWA-NEXT: v_mov_b32_e32 v5, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
; NOSDWA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; NOSDWA-NEXT: v_mov_b32_e32 v8, s4
@@ -893,12 +893,12 @@ define amdgpu_kernel void @mul_v8half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mul_v8half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v4, s0
-; GFX89-NEXT: v_mov_b32_e32 v5, s1
+; GFX89-NEXT: v_mov_b32_e32 v4, s2
+; GFX89-NEXT: v_mov_b32_e32 v5, s3
; GFX89-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX89-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GFX89-NEXT: v_mov_b32_e32 v8, s4
@@ -964,13 +964,13 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; NOSDWA-LABEL: mul_i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v2, s7
; NOSDWA-NEXT: v_add_u32_e32 v1, vcc, s6, v0
; NOSDWA-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v4, s1
-; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; NOSDWA-NEXT: v_mov_b32_e32 v4, s3
+; NOSDWA-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; NOSDWA-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; NOSDWA-NEXT: flat_load_ubyte v2, v[1:2]
; NOSDWA-NEXT: flat_load_ubyte v3, v[3:4]
@@ -984,13 +984,13 @@ define amdgpu_kernel void @mul_i8(ptr addrspace(1) %out, ptr addrspace(1) %ina,
; GFX89-LABEL: mul_i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v2, s7
; GFX89-NEXT: v_add_u32_e32 v1, vcc, s6, v0
; GFX89-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX89-NEXT: v_mov_b32_e32 v4, s1
-; GFX89-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; GFX89-NEXT: v_mov_b32_e32 v4, s3
+; GFX89-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; GFX89-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; GFX89-NEXT: flat_load_ubyte v2, v[1:2]
; GFX89-NEXT: flat_load_ubyte v3, v[3:4]
@@ -1043,14 +1043,14 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v2i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_ushort v4, v[0:1]
; NOSDWA-NEXT: flat_load_ushort v2, v[2:3]
@@ -1071,14 +1071,14 @@ define amdgpu_kernel void @mul_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v2i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_ushort v4, v[0:1]
; GFX89-NEXT: flat_load_ushort v2, v[2:3]
@@ -1143,15 +1143,15 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v4i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dword v4, v[0:1]
-; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -1183,14 +1183,14 @@ define amdgpu_kernel void @mul_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v4i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -1272,14 +1272,14 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; NOSDWA-LABEL: mul_v8i8:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; NOSDWA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1331,14 +1331,14 @@ define amdgpu_kernel void @mul_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %ina
; GFX89-LABEL: mul_v8i8:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX89-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1449,13 +1449,13 @@ entry:
define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
; NOSDWA-LABEL: sitofp_v2i16_to_v2f16:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; NOSDWA-NEXT: v_cvt_f16_i16_e32 v3, v3
@@ -1467,13 +1467,13 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; GFX89-LABEL: sitofp_v2i16_to_v2f16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s2
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
+; GFX89-NEXT: v_mov_b32_e32 v0, s6
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_cvt_f16_i16_sdwa v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX89-NEXT: v_cvt_f16_i16_e32 v2, v2
@@ -1483,29 +1483,29 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; GFX9-LABEL: sitofp_v2i16_to_v2f16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cvt_f16_i16_e32 v2, v1
; GFX9-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: sitofp_v2i16_to_v2f16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_cvt_f16_i16_e32 v2, v1
; GFX10-NEXT: v_cvt_f16_i16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-NEXT: v_lshl_or_b32 v1, v1, 16, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) #0 {
@@ -1520,11 +1520,11 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; NOSDWA-LABEL: mac_v2half:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -1544,11 +1544,11 @@ define amdgpu_kernel void @mac_v2half(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX89-LABEL: mac_v2half:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: flat_load_dword v3, v[0:1]
@@ -1605,15 +1605,15 @@ entry:
define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; NOSDWA-LABEL: immediate_mul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s3
-; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
+; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_mul_lo_u16_e32 v3, 0x7b, v2
; NOSDWA-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -1625,16 +1625,16 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX89-LABEL: immediate_mul_v2i16:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX89-NEXT: v_mov_b32_e32 v3, 0x141
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v1, s3
-; GFX89-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX89-NEXT: v_mov_b32_e32 v1, s7
+; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX89-NEXT: flat_load_dword v2, v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_mul_lo_u16_e32 v4, 0x7b, v2
; GFX89-NEXT: v_mul_lo_u16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1644,27 +1644,27 @@ define amdgpu_kernel void @immediate_mul_v2i16(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: immediate_mul_v2i16:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT: s_mov_b32 s0, 0x141007b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
-; GFX9-NEXT: s_mov_b32 s2, 0x141007b
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s2
-; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, s0
+; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: immediate_mul_v2i16:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_mul_lo_u16 v0, 0x141007b, v0
-; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -1680,14 +1680,14 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; NOSDWA-LABEL: mulmul_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; NOSDWA-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
-; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; NOSDWA-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -1709,14 +1709,14 @@ define amdgpu_kernel void @mulmul_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX89-LABEL: mulmul_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v1, s7
; GFX89-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX89-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
-; GFX89-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX89-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
@@ -1778,12 +1778,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; NOSDWA-LABEL: add_bb_v2i16:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: flat_load_dword v1, v[0:1]
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
@@ -1803,12 +1803,12 @@ define amdgpu_kernel void @add_bb_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX89-LABEL: add_bb_v2i16:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v1, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
@@ -1863,13 +1863,13 @@ store_label:
define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrspace(1) %destValues) #0 {
; NOSDWA-LABEL: pulled_out_test:
; NOSDWA: ; %bb.0: ; %entry
-; NOSDWA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
-; NOSDWA-NEXT: v_mov_b32_e32 v0, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v1, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v0, s4
+; NOSDWA-NEXT: v_mov_b32_e32 v1, s5
; NOSDWA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s6
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s7
; NOSDWA-NEXT: s_waitcnt vmcnt(0)
; NOSDWA-NEXT: v_and_b32_e32 v4, 0xff, v0
; NOSDWA-NEXT: v_lshrrev_b32_e32 v5, 8, v0
@@ -1900,15 +1900,15 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
;
; GFX89-LABEL: pulled_out_test:
; GFX89: ; %bb.0: ; %entry
-; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX89-NEXT: v_mov_b32_e32 v4, 8
; GFX89-NEXT: v_mov_b32_e32 v5, 0xff
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
-; GFX89-NEXT: v_mov_b32_e32 v0, s0
-; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_mov_b32_e32 v0, s4
+; GFX89-NEXT: v_mov_b32_e32 v1, s5
; GFX89-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX89-NEXT: v_mov_b32_e32 v2, s2
-; GFX89-NEXT: v_mov_b32_e32 v3, s3
+; GFX89-NEXT: v_mov_b32_e32 v2, s6
+; GFX89-NEXT: v_mov_b32_e32 v3, s7
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: v_lshrrev_b32_sdwa v6, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX89-NEXT: v_lshrrev_b32_e32 v7, 24, v0
@@ -1929,12 +1929,12 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
;
; GFX9-LABEL: pulled_out_test:
; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 8
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
; GFX9-NEXT: s_movk_i32 s0, 0xff
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v0
@@ -1950,18 +1950,18 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX9-NEXT: v_or_b32_sdwa v3, v8, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: pulled_out_test:
; GFX10: ; %bb.0: ; %entry
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v3, 8
; GFX10-NEXT: v_mov_b32_e32 v4, 24
; GFX10-NEXT: v_mov_b32_e32 v5, 0xff
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_sdwa v6, v3, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_lshrrev_b32_sdwa v7, v4, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1975,7 +1975,7 @@ define amdgpu_kernel void @pulled_out_test(ptr addrspace(1) %sourceA, ptr addrsp
; GFX10-NEXT: v_or_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX10-NEXT: s_endpgm
entry:
%idxprom = ashr exact i64 15, 32
@@ -2207,11 +2207,11 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr
; NOSDWA-LABEL: mac_v2half_same_srcop:
; NOSDWA: ; %bb.0: ; %entry
; NOSDWA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; NOSDWA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; NOSDWA-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; NOSDWA-NEXT: s_waitcnt lgkmcnt(0)
; NOSDWA-NEXT: v_mov_b32_e32 v0, s6
-; NOSDWA-NEXT: v_mov_b32_e32 v2, s0
-; NOSDWA-NEXT: v_mov_b32_e32 v3, s1
+; NOSDWA-NEXT: v_mov_b32_e32 v2, s2
+; NOSDWA-NEXT: v_mov_b32_e32 v3, s3
; NOSDWA-NEXT: v_mov_b32_e32 v1, s7
; NOSDWA-NEXT: flat_load_dword v2, v[2:3]
; NOSDWA-NEXT: flat_load_dword v3, v[0:1]
@@ -2231,12 +2231,12 @@ define amdgpu_kernel void @mac_v2half_same_srcop(ptr addrspace(1) %out, ptr addr
; GFX89-LABEL: mac_v2half_same_srcop:
; GFX89: ; %bb.0: ; %entry
; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX89-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX89-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX89-NEXT: s_waitcnt lgkmcnt(0)
; GFX89-NEXT: v_mov_b32_e32 v0, s6
; GFX89-NEXT: v_mov_b32_e32 v1, s7
-; GFX89-NEXT: v_mov_b32_e32 v2, s0
-; GFX89-NEXT: v_mov_b32_e32 v3, s1
+; GFX89-NEXT: v_mov_b32_e32 v2, s2
+; GFX89-NEXT: v_mov_b32_e32 v3, s3
; GFX89-NEXT: flat_load_dword v4, v[0:1]
; GFX89-NEXT: flat_load_dword v2, v[2:3]
; GFX89-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll
index 0992e9e300f13..53b78bdbeb6ab 100644
--- a/llvm/test/CodeGen/AMDGPU/select.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll
@@ -85,17 +85,17 @@ define amdgpu_kernel void @select_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
-; GFX11-NEXT: s_mov_b32 s14, -1
-; GFX11-NEXT: s_mov_b32 s15, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s14
-; GFX11-NEXT: s_mov_b32 s19, s15
-; GFX11-NEXT: s_mov_b32 s22, s14
-; GFX11-NEXT: s_mov_b32 s23, s15
-; GFX11-NEXT: s_mov_b32 s26, s14
-; GFX11-NEXT: s_mov_b32 s27, s15
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
+; GFX11-NEXT: s_mov_b32 s26, s2
+; GFX11-NEXT: s_mov_b32 s27, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s6
; GFX11-NEXT: s_mov_b32 s17, s7
@@ -109,13 +109,13 @@ define amdgpu_kernel void @select_f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v3, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s4
-; GFX11-NEXT: s_mov_b32 s13, s5
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[12:15], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -174,64 +174,64 @@ define amdgpu_kernel void @select_f16_imm_a(
;
; VI-LABEL: select_f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -287,64 +287,64 @@ define amdgpu_kernel void @select_f16_imm_b(
;
; VI-LABEL: select_f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -400,65 +400,65 @@ define amdgpu_kernel void @select_f16_imm_c(
;
; VI-LABEL: select_f16_imm_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -514,65 +514,65 @@ define amdgpu_kernel void @select_f16_imm_d(
;
; VI-LABEL: select_f16_imm_d:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 glc
+; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
-; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_f16_imm_d:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -801,28 +801,28 @@ define amdgpu_kernel void @select_v2f16_imm_a(
;
; VI-LABEL: select_v2f16_imm_a:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s2, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT: s_movk_i32 s6, 0x3900
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
@@ -830,36 +830,36 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
+; VI-NEXT: v_cmp_lt_f16_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_a:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0
@@ -874,7 +874,7 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -942,28 +942,28 @@ define amdgpu_kernel void @select_v2f16_imm_b(
;
; VI-LABEL: select_v2f16_imm_b:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s15, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
-; VI-NEXT: s_movk_i32 s2, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
+; VI-NEXT: s_movk_i32 s6, 0x3900
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
@@ -971,36 +971,36 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
+; VI-NEXT: v_cmp_gt_f16_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_b:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0
@@ -1015,7 +1015,7 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1083,29 +1083,29 @@ define amdgpu_kernel void @select_v2f16_imm_c(
;
; VI-LABEL: select_v2f16_imm_c:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1118,32 +1118,32 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_c:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -1158,7 +1158,7 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1226,29 +1226,29 @@ define amdgpu_kernel void @select_v2f16_imm_d(
;
; VI-LABEL: select_v2f16_imm_d:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s11, 0xf000
-; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s18, s10
-; VI-NEXT: s_mov_b32 s19, s11
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s18, s2
+; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s16, s4
-; VI-NEXT: s_mov_b32 s17, s5
-; VI-NEXT: s_mov_b32 s14, s10
-; VI-NEXT: s_mov_b32 s12, s2
-; VI-NEXT: s_mov_b32 s13, s3
-; VI-NEXT: s_mov_b32 s15, s11
-; VI-NEXT: s_mov_b32 s4, s6
-; VI-NEXT: s_mov_b32 s5, s7
-; VI-NEXT: s_mov_b32 s6, s10
+; VI-NEXT: s_mov_b32 s16, s8
+; VI-NEXT: s_mov_b32 s17, s9
+; VI-NEXT: s_mov_b32 s14, s2
+; VI-NEXT: s_mov_b32 s12, s6
+; VI-NEXT: s_mov_b32 s13, s7
+; VI-NEXT: s_mov_b32 s15, s3
+; VI-NEXT: s_mov_b32 s8, s10
+; VI-NEXT: s_mov_b32 s9, s11
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
-; VI-NEXT: s_mov_b32 s7, s11
-; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s11, s3
+; VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
@@ -1261,32 +1261,32 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: select_v2f16_imm_d:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s10
-; GFX11-NEXT: s_mov_b32 s19, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s22, s10
-; GFX11-NEXT: s_mov_b32 s23, s11
+; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s16, s4
-; GFX11-NEXT: s_mov_b32 s17, s5
-; GFX11-NEXT: s_mov_b32 s12, s2
-; GFX11-NEXT: s_mov_b32 s13, s3
-; GFX11-NEXT: s_mov_b32 s20, s6
-; GFX11-NEXT: s_mov_b32 s21, s7
+; GFX11-NEXT: s_mov_b32 s16, s8
+; GFX11-NEXT: s_mov_b32 s17, s9
+; GFX11-NEXT: s_mov_b32 s12, s6
+; GFX11-NEXT: s_mov_b32 s13, s7
+; GFX11-NEXT: s_mov_b32 s20, s10
+; GFX11-NEXT: s_mov_b32 s21, s11
; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0
-; GFX11-NEXT: s_mov_b32 s8, s0
-; GFX11-NEXT: s_mov_b32 s9, s1
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-NEXT: s_waitcnt vmcnt(1)
@@ -1301,7 +1301,7 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index b3f4790df4d48..232c05eab6b5c 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -29,17 +29,17 @@ define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s5, s5, s7
-; VI-NEXT: s_lshl_b32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b32 s1, s1, s3
+; VI-NEXT: s_lshl_b32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i32:
@@ -159,21 +159,21 @@ define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: shl_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16:
@@ -396,29 +396,29 @@ define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: shl_i16_computed_amount:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ushort v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_add_u16_e32 v0, 3, v0
; VI-NEXT: v_lshlrev_b16_e32 v0, v0, v2
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_computed_amount:
@@ -484,14 +484,14 @@ define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) {
;
; VI-LABEL: shl_i16_i_s:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b32 s4, s4, 12
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_lshl_b32 s0, s2, 12
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i16_i_s:
@@ -561,26 +561,26 @@ define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s4, 16
-; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s4
+; VI-NEXT: v_lshlrev_b16_e64 v2, v0, s0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_v2i16:
@@ -659,15 +659,15 @@ define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: shl_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
@@ -770,16 +770,16 @@ define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: shl_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: shl_i64:
@@ -1041,14 +1041,14 @@ define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a
;
; VI-LABEL: s_shl_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_32_i64:
@@ -1153,18 +1153,18 @@ define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: s_shl_constant_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s9, 0xffff
-; VI-NEXT: s_mov_b32 s8, s6
-; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s8, s2
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_lshl_b64 s[0:1], s[8:9], s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_lshl_b64 s[4:5], s[8:9], s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_constant_i64:
@@ -1215,20 +1215,20 @@ define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspa
;
; VI-LABEL: v_shl_constant_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_mov_b32 s0, 0xab19b207
-; VI-NEXT: s_movk_i32 s1, 0x11e
+; VI-NEXT: s_load_dword s6, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_mov_b32 s4, 0xab19b207
+; VI-NEXT: s_movk_i32 s5, 0x11e
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_constant_i64:
@@ -1285,16 +1285,16 @@ define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr
;
; VI-LABEL: v_shl_i64_32_bit_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0x12d687, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0x12d687, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_i64_32_bit_constant:
@@ -1349,16 +1349,16 @@ define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: v_shl_inline_imm_64_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s4, s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s0, s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 64, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 64, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_shl_inline_imm_64_i64:
@@ -1407,15 +1407,15 @@ define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad
;
; VI-LABEL: s_shl_inline_imm_64_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 64, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 64, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_64_i64:
@@ -1457,15 +1457,15 @@ define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: s_shl_inline_imm_1_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 1, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 1, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_i64:
@@ -1508,15 +1508,15 @@ define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_1_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 1.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 1.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_1_0_i64:
@@ -1555,15 +1555,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -1.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -1.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
@@ -1602,15 +1602,15 @@ define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_0_5_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0.5, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0.5, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_0_5_i64:
@@ -1649,15 +1649,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -0.5, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -0.5, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
@@ -1696,15 +1696,15 @@ define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_2_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 2.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_2_0_i64:
@@ -1743,15 +1743,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -2.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -2.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
@@ -1790,15 +1790,15 @@ define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr a
;
; VI-LABEL: s_shl_inline_imm_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 4.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 4.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_4_0_i64:
@@ -1837,15 +1837,15 @@ define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], -4.0, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], -4.0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
@@ -1887,15 +1887,15 @@ define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, p
;
; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshl_b64 s[4:5], 0x40800000, s4
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshl_b64 s[0:1], 0x40800000, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
index 1384fb0e0203a..05948d8c80691 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -90,26 +90,26 @@ define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2
define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
@@ -142,24 +142,24 @@ define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-LABEL: v_shl_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX10-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX10-NEXT: global_store_dword v2, v0, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_shl_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -364,27 +364,27 @@ define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1)
define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_imm_v_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_imm_v_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
@@ -416,24 +416,24 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_imm_v_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_imm_v_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -450,26 +450,26 @@ define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
@@ -498,24 +498,24 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_v_imm_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_v_imm_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -532,27 +532,27 @@ define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(
define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
@@ -595,26 +595,26 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX10-LABEL: v_shl_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX10-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_shl_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, v3, v1
; GFX11-NEXT: v_pk_lshlrev_b16 v0, v2, v0
-; GFX11-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -633,27 +633,27 @@ define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %
define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX9-LABEL: shl_v_imm_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: shl_v_imm_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
@@ -692,26 +692,26 @@ define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(
;
; GFX10-LABEL: shl_v_imm_v4i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: shl_v_imm_v4i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 1a55bf608ebf5..5af7dfe7b31a5 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -48,15 +48,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_x_sub_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
@@ -65,16 +65,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_x_sub_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -84,35 +84,35 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_x_sub_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_x_sub_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -173,18 +173,18 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_i32_x_sub_64_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_dword v4, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; VI-SDAG-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
@@ -196,19 +196,19 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_i32_x_sub_64_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_dword v4, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
@@ -221,52 +221,52 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v2, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_x_sub_64_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1
; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 64, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc
+; GFX11-NEXT: global_store_b32 v0, v2, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -319,15 +319,15 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_64_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 64, v3
@@ -336,16 +336,16 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_64_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -355,35 +355,35 @@ define amdgpu_kernel void @v_test_i32_64_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_64_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_64_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_64_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 64, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -432,15 +432,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_x_sub_65:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3
@@ -449,16 +449,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_x_sub_65:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -468,70 +468,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0x41, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_65:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_65:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -580,15 +580,15 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i32_65_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 0x41, v3
@@ -597,16 +597,16 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i32_65_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -616,35 +616,35 @@ define amdgpu_kernel void @v_test_i32_65_sub_x(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i32_65_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_65_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_65_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -693,15 +693,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_x_sub_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 16, v3
@@ -710,16 +710,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_x_sub_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -729,70 +729,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 16, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, -16, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -841,15 +841,15 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_neg16_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, -16, v3
@@ -858,16 +858,16 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_neg16_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -877,35 +877,35 @@ define amdgpu_kernel void @v_test_i32_neg16_sub_x(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_i32_neg16_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg16_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_neg16_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, -16, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -954,15 +954,15 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_x_sub_neg17:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u32_e32 v2, vcc, 17, v3
@@ -971,16 +971,16 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_x_sub_neg17:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -990,70 +990,70 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add
;
; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 17, v1
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0xffffffef, v1
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -1102,15 +1102,15 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_i32_neg17_sub_x:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u32_e32 v2, vcc, 0xffffffef, v3
@@ -1119,16 +1119,16 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_i32_neg17_sub_x:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1138,35 +1138,35 @@ define amdgpu_kernel void @v_test_i32_neg17_sub_x(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_i32_neg17_sub_x:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_neg17_sub_x:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i32_neg17_sub_x:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1270,15 +1270,15 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v3
@@ -1287,16 +1287,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1306,35 +1306,35 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp
;
; GFX9-LABEL: v_test_i16_x_sub_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1387,16 +1387,16 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v2, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; VI-SDAG-NEXT: v_mov_b32_e32 v2, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-SDAG-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-SDAG-NEXT: flat_load_ushort v2, v[1:2]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v2
@@ -1405,17 +1405,17 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v2, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v2, s7
; VI-GISEL-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; VI-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-GISEL-NEXT: flat_load_ushort v2, v[1:2]
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v3
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1425,41 +1425,41 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX9-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
+; GFX10-NEXT: global_load_ushort v1, v1, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v1, s[2:3]
+; GFX11-NEXT: global_load_u16 v1, v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1521,18 +1521,18 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_i16_x_sub_64_multi_use:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: flat_load_ushort v4, v[0:1] glc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v3
; VI-SDAG-NEXT: v_subrev_u16_e32 v3, 64, v4
@@ -1544,19 +1544,19 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_i16_x_sub_64_multi_use:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_ushort v3, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: flat_load_ushort v4, v[0:1] glc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3
@@ -1569,52 +1569,52 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v1, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64
-; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-NEXT: global_store_short v0, v2, s[4:5]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_i16_x_sub_64_multi_use:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_u16 v2, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64
; GFX11-NEXT: v_sub_nc_u16 v2, v2, 64
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc
+; GFX11-NEXT: global_store_b16 v0, v2, s[4:5] dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -1676,16 +1676,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1696,17 +1696,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1718,35 +1718,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_64_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1804,16 +1804,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 64
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, -7, v3
@@ -1824,17 +1824,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1846,48 +1846,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_7_64:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x400007
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x400007
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_7_64:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x400007
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_64:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x400007
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_7_64:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x400007
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1945,16 +1945,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffff85
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -1965,17 +1965,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -1987,48 +1987,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_64_123:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_mov_b32 s0, 0x7b0040
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x7b0040
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_64_123:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x7b0040
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_64_123:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_64_123:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x7b0040
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2084,15 +2084,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_7_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -2103,17 +2103,17 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_7_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -2125,35 +2125,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_7_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_7_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 7
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2208,16 +2208,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, -16
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2227,18 +2227,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 16
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2248,35 +2248,35 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
;
; GFX9-LABEL: v_test_v2i16_x_sub_0_16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2330,16 +2330,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0x3c00
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2349,18 +2349,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffc400
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2370,48 +2370,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_brev_b32 s0, 35
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_brev_b32 s2, 35
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 35
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_1_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0xc4000000
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2465,16 +2465,16 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; VI-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0xffffbc00
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2484,18 +2484,18 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; VI-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4400
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2505,48 +2505,48 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_brev_b32 s0, 34
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_brev_b32 s2, 34
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s2
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_sub_i16 v1, v1, s0
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_bfrev_b32_e32 v2, 34
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_sub_i16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x44000000
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_sub_0_neg1_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_i16 v1, v1, 0x44000000
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2605,16 +2605,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 32
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2625,17 +2625,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffffe0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -2647,35 +2647,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg32_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2729,16 +2729,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 32
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2748,18 +2748,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -2769,35 +2769,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_0_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2852,15 +2852,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -2871,17 +2871,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -2893,35 +2893,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg32_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2980,16 +2980,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, -16
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, -16, v3
@@ -3000,17 +3000,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, -16
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3022,35 +3022,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg16_neg16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, -16 op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3104,16 +3104,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_0_neg16:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, -16
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3123,18 +3123,18 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_0_neg16:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, -16
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3144,35 +3144,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_0_neg16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3227,15 +3227,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg16_0:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
@@ -3246,17 +3246,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg16_0:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3
@@ -3268,35 +3268,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(ptr addrspace(1) %out, ptr
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg16_0:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 16
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3354,16 +3354,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc400
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0xc400, v3
@@ -3374,17 +3374,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc400
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3396,72 +3396,72 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_movk_i32 s0, 0xc400
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_movk_i32 s2, 0xc400
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xc400c400
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xc400c400, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3519,16 +3519,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4400
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0x4400, v3
@@ -3539,17 +3539,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4400
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3561,72 +3561,72 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT: s_movk_i32 s0, 0x4400
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT: s_movk_i32 s2, 0x4400
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, s0 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0x44004400
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0x44004400, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
@@ -3684,16 +3684,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0x4000, v3
@@ -3704,17 +3704,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_fptwo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x4000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3726,35 +3726,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg_fptwo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, 2.0 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3812,16 +3812,16 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc000
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_add_u16_e32 v2, 0xc000, v3
@@ -3832,17 +3832,17 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffc000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -3854,35 +3854,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_neg_negfptwo:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v1, v1, -2.0 op_sel:[0,1]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -3935,15 +3935,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_mov_b32_e32 v2, 32
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -3953,19 +3953,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_undef_neg32:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
-; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@@ -3975,35 +3975,35 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
;
; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_v2i16_x_add_undef_neg32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -4054,15 +4054,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; VI-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI-SDAG: ; %bb.0:
-; VI-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s3
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s7
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: flat_load_dword v3, v[0:1]
-; VI-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-SDAG-NEXT: v_mov_b32_e32 v1, s5
+; VI-SDAG-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-SDAG-NEXT: s_waitcnt vmcnt(0)
; VI-SDAG-NEXT: v_subrev_u16_e32 v2, 32, v3
@@ -4071,19 +4071,19 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; VI-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; VI-GISEL: ; %bb.0:
-; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT: s_and_b32 s0, 0xffff, s0
+; VI-GISEL-NEXT: s_lshl_b32 s0, s0, 16
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s6
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s7
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: flat_load_dword v3, v[0:1]
-; VI-GISEL-NEXT: s_and_b32 s2, 0xffff, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; VI-GISEL-NEXT: v_mov_b32_e32 v0, s4
+; VI-GISEL-NEXT: v_mov_b32_e32 v1, s5
; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2
-; VI-GISEL-NEXT: s_lshl_b32 s0, s2, 16
; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-GISEL-NEXT: s_waitcnt vmcnt(0)
; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffe0, v3
@@ -4093,71 +4093,71 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_undef(ptr addrspace(1) %out,
;
; GFX9-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9-SDAG: ; %bb.0:
-; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-SDAG-NEXT: s_endpgm
;
; GFX9-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX9-GISEL: ; %bb.0:
-; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffffe0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, v2
-; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-GISEL-NEXT: s_endpgm
;
; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-SDAG-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
;
; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_pk_sub_u16 v1, v1, 32
-; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-SDAG-NEXT: s_nop 0
; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-SDAG-NEXT: s_endpgm
;
; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg32_undef:
; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, 0xffffffe0, v1
-; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-GISEL-NEXT: s_nop 0
; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 1ab63762ecbd7..3dcdfebf1d2a2 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -45,12 +45,12 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out,
; FLAT-NEXT: s_cbranch_execnz .LBB0_1
; FLAT-NEXT: ; %bb.2: ; %ENDLOOP
; FLAT-NEXT: s_or_b64 exec, exec, s[2:3]
-; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; FLAT-NEXT: s_mov_b32 s3, 0xf000
-; FLAT-NEXT: s_mov_b32 s2, -1
+; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; FLAT-NEXT: s_mov_b32 s7, 0xf000
+; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: v_mov_b32_e32 v0, 0
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
-; FLAT-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; FLAT-NEXT: buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT: s_endpgm
main_body:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
index 33249e4faeccd..1aa3da9a90600 100644
--- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll
@@ -130,15 +130,15 @@ define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) noun
;
; VI-LABEL: s_sext_i32_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s5, s4, 31
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s0, s2, 31
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i32 %a to i64
store i64 %sext, ptr addrspace(1) %out, align 8
@@ -166,20 +166,20 @@ define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_sext_i32_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %in, align 4
%sext = sext i32 %val to i64
@@ -203,15 +203,15 @@ define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) noun
;
; VI-LABEL: s_sext_i16_to_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x100000
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i16 %a to i64
store i64 %sext, ptr addrspace(1) %out, align 8
@@ -276,17 +276,17 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s4, s5
-; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_cmp_eq_u32 s6, s7
-; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
-; VI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
@@ -375,26 +375,26 @@ define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) n
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s4
-; VI-NEXT: s_ashr_i32 s5, s4, 24
-; VI-NEXT: s_bfe_i32 s6, s4, 0x80010
-; VI-NEXT: s_sext_i32_i8 s4, s4
+; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s2
+; VI-NEXT: s_ashr_i32 s0, s2, 24
+; VI-NEXT: s_bfe_i32 s1, s2, 0x80010
+; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
-; VI-NEXT: v_mov_b32_e32 v1, s4
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v1, s2
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s6
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s5
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%cast = bitcast i32 %a to <4 x i8>
@@ -443,30 +443,30 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrs
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%a = load i32, ptr addrspace(1) %in
@@ -513,27 +513,27 @@ define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a)
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: s_ashr_i32 s1, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: s_ashr_i32 s0, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: s_ashr_i32 s5, s6, 16
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: s_ashr_i32 s4, s7, 16
+; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s5
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%cast = bitcast i64 %a to <4 x i16>
@@ -580,29 +580,29 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addr
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v3, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
%a = load i64, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 539cfc71a80f9..4770a356a838e 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -42,25 +42,25 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;
; GFX9-LABEL: test_simple_indirect_call:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x4
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x4
; GFX9-NEXT: s_add_u32 flat_scratch_lo, s10, s15
; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s11, 0
; GFX9-NEXT: s_add_u32 s0, s0, s15
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s4, s4, 16
-; GFX9-NEXT: s_mul_i32 s4, s4, s5
+; GFX9-NEXT: s_lshr_b32 s4, s6, 16
+; GFX9-NEXT: s_mul_i32 s4, s4, s7
; GFX9-NEXT: v_mul_lo_u32 v0, s4, v0
-; GFX9-NEXT: s_getpc_b64 s[6:7]
-; GFX9-NEXT: s_add_u32 s6, s6, indirect@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s7, s7, indirect@rel32@hi+12
-; GFX9-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NEXT: v_mov_b32_e32 v4, s7
-; GFX9-NEXT: v_mad_u32_u24 v0, v1, s5, v0
+; GFX9-NEXT: s_getpc_b64 s[4:5]
+; GFX9-NEXT: s_add_u32 s4, s4, indirect@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s5, s5, indirect@rel32@hi+12
+; GFX9-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NEXT: v_mad_u32_u24 v0, v1, s7, v0
; GFX9-NEXT: v_add_lshl_u32 v0, v0, v2, 3
; GFX9-NEXT: s_mov_b32 s32, 0
; GFX9-NEXT: ds_write_b64 v0, v[3:4]
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: s_endpgm
%fptr = alloca ptr, addrspace(5)
%fptr.cast = addrspacecast ptr addrspace(5) %fptr to ptr
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index b03726817c1b4..ba0f4c8dbe360 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -32,50 +32,50 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s5, s2, s3
-; GFX8-NEXT: s_flbit_i32 s4, s3
-; GFX8-NEXT: s_ashr_i32 s5, s5, 31
-; GFX8-NEXT: s_add_i32 s4, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s4, s4, s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s2, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_ldexp_f32 v0, v0, s0
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s4, s2, s3
-; GFX11-NEXT: s_cls_i32 s5, s3
-; GFX11-NEXT: s_ashr_i32 s4, s4, 31
-; GFX11-NEXT: s_add_i32 s5, s5, -1
-; GFX11-NEXT: s_add_i32 s4, s4, 32
+; GFX11-NEXT: s_xor_b32 s0, s6, s7
+; GFX11-NEXT: s_cls_i32 s1, s7
+; GFX11-NEXT: s_ashr_i32 s0, s0, 31
+; GFX11-NEXT: s_add_i32 s1, s1, -1
+; GFX11-NEXT: s_add_i32 s0, s0, 32
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s5, s4
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s1, s0
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -116,12 +116,12 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -136,8 +136,8 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
@@ -146,11 +146,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
; GFX11-NEXT: v_cls_i32_e32 v4, v2
@@ -170,7 +170,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -209,47 +209,47 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_sint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s5, s2, s3
-; GFX8-NEXT: s_flbit_i32 s4, s3
-; GFX8-NEXT: s_ashr_i32 s5, s5, 31
-; GFX8-NEXT: s_add_i32 s4, s4, -1
-; GFX8-NEXT: s_add_i32 s5, s5, 32
-; GFX8-NEXT: s_min_u32 s4, s4, s5
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_sub_i32 s0, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s2, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_sint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s4, s2, s3
-; GFX11-NEXT: s_cls_i32 s5, s3
-; GFX11-NEXT: s_ashr_i32 s4, s4, 31
-; GFX11-NEXT: s_add_i32 s5, s5, -1
-; GFX11-NEXT: s_add_i32 s4, s4, 32
+; GFX11-NEXT: s_xor_b32 s0, s6, s7
+; GFX11-NEXT: s_cls_i32 s1, s7
+; GFX11-NEXT: s_ashr_i32 s0, s0, 31
+; GFX11-NEXT: s_add_i32 s1, s1, -1
+; GFX11-NEXT: s_add_i32 s0, s0, 32
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s5, s4
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s1, s0
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -289,12 +289,12 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_sint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -305,11 +305,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; GFX8-NEXT: v_min_u32_e32 v4, v4, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v5, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
@@ -318,11 +318,11 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_sint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v3, v1, v2
; GFX11-NEXT: v_cls_i32_e32 v4, v2
@@ -341,7 +341,7 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -392,34 +392,34 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s3, s6, s7
-; GFX8-NEXT: s_flbit_i32 s2, s7
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s9, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s9
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_xor_b32 s2, s4, s5
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s9, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s9
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_xor_b32 s0, s4, s5
; GFX8-NEXT: s_flbit_i32 s8, s5
-; GFX8-NEXT: s_ashr_i32 s2, s2, 31
+; GFX8-NEXT: s_ashr_i32 s0, s0, 31
; GFX8-NEXT: s_add_i32 s8, s8, -1
-; GFX8-NEXT: s_add_i32 s2, s2, 32
-; GFX8-NEXT: s_min_u32 s6, s8, s2
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s6
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
-; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s6
-; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_add_i32 s0, s0, 32
+; GFX8-NEXT: s_min_u32 s6, s8, s0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s6
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s6
+; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -427,35 +427,35 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s6, s7
+; GFX11-NEXT: s_xor_b32 s1, s6, s7
; GFX11-NEXT: s_xor_b32 s9, s4, s5
-; GFX11-NEXT: s_cls_i32 s2, s7
+; GFX11-NEXT: s_cls_i32 s0, s7
; GFX11-NEXT: s_cls_i32 s8, s5
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
+; GFX11-NEXT: s_ashr_i32 s1, s1, 31
; GFX11-NEXT: s_ashr_i32 s9, s9, 31
-; GFX11-NEXT: s_add_i32 s2, s2, -1
+; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_i32 s8, s8, -1
-; GFX11-NEXT: s_add_i32 s3, s3, 32
+; GFX11-NEXT: s_add_i32 s1, s1, 32
; GFX11-NEXT: s_add_i32 s9, s9, 32
-; GFX11-NEXT: s_min_u32 s10, s2, s3
+; GFX11-NEXT: s_min_u32 s10, s0, s1
; GFX11-NEXT: s_min_u32 s8, s8, s9
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s10
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s10
-; GFX11-NEXT: s_sub_i32 s3, 32, s8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v2, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s10
+; GFX11-NEXT: s_sub_i32 s1, 32, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v0, v2, s1
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -534,19 +534,19 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
@@ -603,12 +603,12 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
@@ -664,7 +664,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
-; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v7, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -719,37 +719,37 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_sint_to_fp_v2i64_to_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_xor_b32 s3, s6, s7
-; GFX8-NEXT: s_flbit_i32 s2, s7
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s8, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: s_xor_b32 s3, s4, s5
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX8-NEXT: s_flbit_i32 s2, s5
-; GFX8-NEXT: s_ashr_i32 s3, s3, 31
-; GFX8-NEXT: s_add_i32 s2, s2, -1
-; GFX8-NEXT: s_add_i32 s3, s3, 32
-; GFX8-NEXT: s_min_u32 s7, s2, s3
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s7
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s2
+; GFX8-NEXT: s_xor_b32 s1, s6, s7
+; GFX8-NEXT: s_flbit_i32 s0, s7
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s8, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: s_xor_b32 s1, s4, s5
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX8-NEXT: s_flbit_i32 s0, s5
+; GFX8-NEXT: s_ashr_i32 s1, s1, 31
+; GFX8-NEXT: s_add_i32 s0, s0, -1
+; GFX8-NEXT: s_add_i32 s1, s1, 32
+; GFX8-NEXT: s_min_u32 s7, s0, s1
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s7
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_i32_e32 v1, s0
; GFX8-NEXT: s_sub_i32 s6, 32, s8
-; GFX8-NEXT: s_sub_i32 s2, 32, s7
+; GFX8-NEXT: s_sub_i32 s0, 32, s7
; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
-; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -757,40 +757,40 @@ define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_xor_b32 s3, s6, s7
+; GFX11-NEXT: s_xor_b32 s1, s6, s7
; GFX11-NEXT: s_xor_b32 s9, s4, s5
-; GFX11-NEXT: s_cls_i32 s2, s7
+; GFX11-NEXT: s_cls_i32 s0, s7
; GFX11-NEXT: s_cls_i32 s8, s5
-; GFX11-NEXT: s_ashr_i32 s3, s3, 31
+; GFX11-NEXT: s_ashr_i32 s1, s1, 31
; GFX11-NEXT: s_ashr_i32 s9, s9, 31
-; GFX11-NEXT: s_add_i32 s2, s2, -1
+; GFX11-NEXT: s_add_i32 s0, s0, -1
; GFX11-NEXT: s_add_i32 s8, s8, -1
-; GFX11-NEXT: s_add_i32 s3, s3, 32
+; GFX11-NEXT: s_add_i32 s1, s1, 32
; GFX11-NEXT: s_add_i32 s9, s9, 32
-; GFX11-NEXT: s_min_u32 s10, s2, s3
+; GFX11-NEXT: s_min_u32 s10, s0, s1
; GFX11-NEXT: s_min_u32 s8, s8, s9
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s10
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s10
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s8
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s10
-; GFX11-NEXT: s_sub_i32 s3, 32, s8
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s10
+; GFX11-NEXT: s_sub_i32 s1, 32, s8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v1, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -877,18 +877,18 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_sint_to_fp_v4i64_to_v4f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_xor_b32_e32 v0, v3, v4
; GFX8-NEXT: v_xor_b32_e32 v12, v1, v2
@@ -943,7 +943,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
@@ -952,12 +952,12 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_sint_to_fp_v4i64_to_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_xor_b32_e32 v9, v3, v4
; GFX11-NEXT: v_xor_b32_e32 v11, v1, v2
@@ -1022,7 +1022,7 @@ define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
index b4b0d960e12e5..838457674bd9f 100644
--- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @sitofp_i16_to_f16(
;
; VI-LABEL: sitofp_i16_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_i16_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @sitofp_i32_to_f16(
;
; VI-LABEL: sitofp_i32_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_i32_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,44 +168,44 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
;
; VI-LABEL: sitofp_v2i16_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_i16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f16_i16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_v2i16_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,39 +244,39 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
;
; VI-LABEL: sitofp_v2i32_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_i32_e32 v1, v1
; VI-NEXT: v_cvt_f32_i32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: sitofp_v2i32_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
@@ -285,7 +285,7 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,20 +357,19 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -378,9 +377,10 @@ define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index e1bd1523d78a4..e585e6c720457 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -4988,1318 +4988,1317 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-LABEL: test:
; GFX9-FLATSCR: ; %bb.0: ; %entry
; GFX9-FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s2, s5
-; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
-; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x80
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x80
; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s2, v5
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, s6, v5
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s7
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v0, vcc
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 4
+; GFX9-FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 20
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 20
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 36
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 36
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 52
+; GFX9-FLATSCR-NEXT: s_mov_b32 s0, 52
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x44
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x54
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x64
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x74
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x100
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x100
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x84
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x94
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x180
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x180
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x114
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x124
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x134
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x154
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x164
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x174
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x200
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x200
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x194
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1f4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x280
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x280
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x204
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x214
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x234
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x244
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x254
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x274
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x300
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x300
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x284
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x294
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x2f4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x2f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x380
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x380
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x314
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x324
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x334
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x354
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x364
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x374
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x400
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s4, v2
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x400
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x394
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3a4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3b4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3c4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3d4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:4064
; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3e4
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x3f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x404
-; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s1
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7]
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x404
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v6, s5
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x414
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:16
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x414
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x424
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:32
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x424
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x434
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:48
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x434
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x444
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:64
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x444
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x454
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:80
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x454
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x464
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:96
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x464
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x474
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:112
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x474
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x484
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:128
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x484
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x494
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x494
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:160
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:176
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:192
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:208
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x4f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:240
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x504
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:256
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x504
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x514
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:272
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x514
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x524
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:288
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x524
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x534
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x534
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x544
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:320
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x544
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x554
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:336
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x554
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x564
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:352
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x564
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x574
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:368
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x574
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x584
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x584
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x594
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:400
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x594
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:416
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:432
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:448
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:464
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:480
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x5f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:496
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x604
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:512
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x604
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x614
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:528
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x614
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x624
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:544
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x624
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x634
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:560
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x634
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x644
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:576
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x644
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x654
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:592
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x654
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x664
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:608
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x664
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x674
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:624
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x674
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x684
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:640
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x684
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x694
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:656
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x694
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:672
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:688
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:704
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:720
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:736
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x6f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:752
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x704
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:768
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x704
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x714
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:784
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x714
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x724
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:800
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x724
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x734
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:816
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x734
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x744
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:832
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x744
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x754
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:848
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x754
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x764
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:864
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x764
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x774
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:880
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x774
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x784
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:896
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x784
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x794
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:912
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x794
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:928
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:944
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:960
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:976
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:992
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x7f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1008
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x804
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1024
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x814
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1040
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x824
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1056
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x834
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1072
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x844
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1088
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x854
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x864
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1120
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x874
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1136
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x884
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1152
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x894
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1168
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1200
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1216
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1232
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1248
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x904
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1280
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x914
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1296
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x924
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1312
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x934
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1328
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x944
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x954
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1360
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x964
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1376
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x974
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1392
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x984
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1408
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x994
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1424
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1440
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1456
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1472
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1488
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1504
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1520
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1536
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1552
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1568
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1584
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1600
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1616
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1632
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1648
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1664
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xa94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1680
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1696
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xab4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1712
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xac4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1728
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xad4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1744
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xae4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1760
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1776
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1792
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1808
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1824
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1840
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1856
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1872
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1888
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1904
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1920
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xb94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1936
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xba4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1952
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1968
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1984
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2000
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2016
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2032
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2048
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2064
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2080
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2096
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2096
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2112
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2112
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2128
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2128
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2144
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2144
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2160
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2160
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2176
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2176
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2192
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xc94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2192
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2208
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xca4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2208
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2224
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2224
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2240
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2240
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2256
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2256
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2272
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xce4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2272
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2288
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2288
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2304
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2304
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2320
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2320
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2336
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2336
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2352
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2352
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2368
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2368
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2384
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2384
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2400
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2400
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2416
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2416
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2432
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2432
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2448
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xd94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2448
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2464
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xda4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2464
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2480
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2480
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2496
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2496
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2512
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2512
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2528
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xde4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2528
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2544
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2544
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2560
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2560
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2576
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2576
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2592
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2592
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2608
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2608
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2624
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2624
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2640
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2640
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2656
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2656
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2672
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2672
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2688
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2688
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2704
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xe94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2704
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2720
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xea4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2720
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2736
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2736
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2752
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xec4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2752
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2768
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xed4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2768
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2784
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xee4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2784
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2800
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xef4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2800
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2816
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf04
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2816
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2832
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf14
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2832
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2848
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf24
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2848
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2864
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf34
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2864
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2880
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf44
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2880
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2896
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf54
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2896
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2912
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf64
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2912
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2928
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf74
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2928
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2944
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf84
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2944
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2960
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xf94
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2960
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2976
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2976
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2992
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2992
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3008
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3008
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3024
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3024
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3040
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3040
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3056
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0xff4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3056
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3072
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1004
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3072
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3088
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1014
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3088
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3104
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1024
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3104
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3120
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1034
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3120
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3136
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1044
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3136
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3152
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1054
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3152
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3168
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1064
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3168
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3184
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1074
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3184
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3200
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1084
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3200
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3216
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1094
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3216
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3232
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3232
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3248
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3248
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3264
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3264
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3280
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3280
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3296
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3296
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3312
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3312
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3328
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1104
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3328
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3344
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1114
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3344
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3360
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1124
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3360
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3376
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1134
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3376
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3392
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1144
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3392
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3408
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1154
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3408
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3424
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1164
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3424
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3440
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1174
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3440
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3456
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1184
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3456
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3472
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1194
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3472
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3488
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3488
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3504
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3504
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3520
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3520
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3536
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3536
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3552
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3552
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3568
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3568
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3584
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1204
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3584
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3600
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1214
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3600
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3616
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1224
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3616
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3632
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1234
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3632
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3648
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1244
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3648
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3664
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1254
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3664
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3680
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1264
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3680
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3696
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1274
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3696
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3712
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1284
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3712
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3728
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1294
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3728
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3744
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3744
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3760
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3760
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3776
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3776
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3792
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3792
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3808
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3808
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3824
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3824
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3840
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1304
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3840
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3856
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1314
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3856
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3872
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1324
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3872
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3888
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1334
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3888
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3904
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1344
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3904
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3920
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1354
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3920
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3936
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1364
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3936
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3952
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1374
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3952
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3968
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1384
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3968
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:3984
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x1394
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:3984
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4000
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4000
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4016
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4016
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4032
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4032
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4048
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4048
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4064
-; GFX9-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4064
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:4080
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4
+; GFX9-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX9-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:4080
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
@@ -6314,1035 +6313,1036 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-FLATSCR-NEXT: ;;#ASMEND
; GFX9-FLATSCR-NEXT: ;;#ASMSTART
; GFX9-FLATSCR-NEXT: ;;#ASMEND
-; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s0, v5
+; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v4, vcc, s4, v5
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4080
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13d4
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4064
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4080
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4048
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4064
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4032
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x13a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4048
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4016
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1394
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4032
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:4000
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1384
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4016
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3984
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1374
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:4000
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3968
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1364
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3984
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3952
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1354
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3968
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3936
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1344
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3952
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3920
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1334
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3936
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3904
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1324
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3920
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3888
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1314
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3904
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3872
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1304
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3888
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3856
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3872
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3840
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3856
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3824
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3840
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3808
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3824
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3792
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3808
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3776
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x12a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3792
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3760
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1294
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3776
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3744
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1284
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3760
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3728
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1274
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3744
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3712
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1264
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3728
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3696
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1254
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3712
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3680
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1244
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3696
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3664
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1234
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3680
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3648
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1224
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3664
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3632
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1214
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3648
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3616
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1204
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3632
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3600
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3616
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3584
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3600
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3568
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3584
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3552
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3568
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3536
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3552
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3520
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x11a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3536
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3504
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1194
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3520
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3488
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1184
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3504
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3472
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1174
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3488
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3456
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1164
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3472
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3440
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1154
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3456
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3424
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1144
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3440
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3408
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1134
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3424
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3392
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1124
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3408
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3376
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1114
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3392
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3360
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1104
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3376
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3344
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3360
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3328
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3344
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3312
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3328
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3296
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3312
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3280
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3296
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3264
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x10a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3280
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3248
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1094
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3264
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3232
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1084
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3248
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3216
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1074
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3232
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3200
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1064
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3216
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3184
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1054
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3200
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3168
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1044
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3184
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3152
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1034
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3168
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3136
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1024
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3152
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3120
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1014
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3136
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3104
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x1004
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3120
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3088
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xff4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3104
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3072
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfe4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3088
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3056
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3072
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3040
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3056
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3024
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3040
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:3008
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xfa4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3024
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2992
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:3008
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2976
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2992
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2960
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2976
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2944
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2960
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2928
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2944
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2912
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2928
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2896
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2912
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2880
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2896
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2864
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2880
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2848
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xf04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2864
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2832
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xef4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2848
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2816
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xee4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2832
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2800
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xed4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2816
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2784
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xec4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2800
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2768
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xeb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2784
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2752
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xea4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2768
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2736
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2752
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2720
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2736
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2704
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2720
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2688
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2704
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2672
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2688
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2656
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2672
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2640
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2656
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2624
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2640
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2608
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2624
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2592
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xe04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2608
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2576
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2592
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2560
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xde4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2576
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2544
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2560
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2528
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2544
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2512
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xdb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2528
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2496
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xda4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2512
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2480
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2496
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2464
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2480
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2448
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2464
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2432
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2448
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2416
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2432
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2400
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2416
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2384
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2400
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2368
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2384
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2352
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2368
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2336
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xd04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2352
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2320
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2336
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2304
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xce4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2320
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2288
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2304
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2272
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2288
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2256
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xcb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2272
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2240
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xca4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2256
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2240
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2208
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2192
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2208
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2192
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2160
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2176
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2144
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2160
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2128
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2144
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2128
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2096
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2080
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xc04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2096
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2064
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2080
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2048
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbe4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2064
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbd4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2048
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbc4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2032
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xbb4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2016
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xba4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2000
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1984
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1968
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1952
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1936
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1920
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1904
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1888
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1872
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1856
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xb04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1840
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xaf4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1824
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xae4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1808
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xad4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1792
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xac4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1776
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xab4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1760
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xaa4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1744
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa94
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1728
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa84
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1712
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa74
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1696
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa64
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1680
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa54
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1664
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa44
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1648
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa34
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1632
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa24
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1616
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa14
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1600
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0xa04
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1584
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1568
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1552
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1536
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1520
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1504
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x9a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1488
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x994
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1472
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x984
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1456
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x974
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1440
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x964
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1424
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x954
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1408
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x944
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1392
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x934
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1376
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x924
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1360
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x914
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1344
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x904
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1328
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1312
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1296
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1280
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1264
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1248
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x8a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1232
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x894
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1216
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x884
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1200
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x874
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1184
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x864
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1168
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x854
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1152
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x844
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1136
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x834
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1120
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x824
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1104
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x814
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1088
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x804
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1072
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1056
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1040
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1024
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1008
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:992
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x7a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:976
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x794
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:960
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x7a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x784
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:944
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x794
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x774
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:928
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x784
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x764
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:912
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x774
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x754
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:896
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x764
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x744
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:880
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x754
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x734
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:864
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x744
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x724
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:848
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x734
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x714
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:832
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x724
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x704
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:816
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x714
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:800
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x704
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:784
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:768
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:752
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:736
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x6a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:720
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x694
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:704
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x6a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x684
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:688
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x694
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x674
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:672
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x684
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x664
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:656
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x674
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x654
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:640
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x664
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x644
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:624
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x654
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x634
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:608
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x644
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x624
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:592
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x634
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x614
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:576
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x624
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x604
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:560
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x614
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:544
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x604
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:528
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:512
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:496
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:480
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x5a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:464
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x594
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:448
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x5a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x584
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:432
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x594
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x574
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:416
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x584
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x564
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:400
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x574
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x554
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:384
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x564
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x544
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:368
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x554
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x534
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:352
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x544
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x524
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:336
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x534
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x514
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:320
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x524
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x504
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:304
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x514
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4f4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:288
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x504
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4e4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:272
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4f4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4d4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:256
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4e4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4c4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:240
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4d4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4b4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:224
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4c4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x4a4
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:208
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4b4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x494
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:192
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x4a4
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x484
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:176
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x494
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x474
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:160
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x484
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x464
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:144
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x474
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x454
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:128
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x464
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x444
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:112
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x454
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x434
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:96
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x444
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x424
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:80
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x434
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x414
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:64
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x424
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX9-FLATSCR-NEXT: s_movk_i32 s2, 0x404
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:48
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x414
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
-; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:32
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x404
; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:16
+; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3f4
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX9-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX9-FLATSCR-NEXT: s_nop 0
; GFX9-FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, 0x400, v4
; GFX9-FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
-; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3e4
-; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(1)
; GFX9-FLATSCR-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:4080
; GFX9-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
; GFX9-FLATSCR-NEXT: s_movk_i32 s0, 0x3d4
@@ -7617,14 +7617,14 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: s_addc_u32 s3, s3, 0
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-FLATSCR-NEXT: v_lshlrev_b32_e32 v5, 13, v0
; GFX10-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s4, s2, v5
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, s3, 0, s4
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x804
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s0, s6, v5
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v22, null, s7, 0, s0
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x804
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x80, v4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v22, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x100, v4
@@ -8045,795 +8045,795 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], off offset:2036 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x814
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x814
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x824
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x824
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x834
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x834
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x844
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x844
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x854
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x854
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x864
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x864
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x874
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x874
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v6
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v7, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x884
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x884
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x894
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x894
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8a4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8b4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8c4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8d4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[20:23], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8e4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x8f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[20:23], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x8f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v8
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v9, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x904
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x904
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x914
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x914
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x924
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x924
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x934
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x934
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x944
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x944
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x954
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x954
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x964
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x964
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x974
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x974
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v10
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v11, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x984
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x984
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x994
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x994
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9a4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9b4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9c4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9d4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9e4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x9f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x9f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v12
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v13, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa04
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa14
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa24
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa34
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa44
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa54
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v14
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v15, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa84
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xa94
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xa94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xaa4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xaa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xab4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xab4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xac4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xac4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xad4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xad4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xae4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xae4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xaf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xaf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v16
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v17, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb04
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb04
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb14
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb24
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb34
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb44
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb54
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x780, v18
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v19, vcc_lo
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb84
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb84
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1920
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xb94
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xb94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xba4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xba4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbb4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbc4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbd4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
; GFX10-FLATSCR-NEXT: s_clause 0x1
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:2016
; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:2032
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbe4
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xbf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[6:9], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3]
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7]
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:16
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:16
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:32
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:32
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:48
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:48
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:64
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:64
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:80
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:80
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:96
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:96
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:112
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:112
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:128
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:128
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:144
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xc94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:144
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:160
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xca4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:160
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:176
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:176
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:192
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:192
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:208
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:208
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:224
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xce4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:224
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:240
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xcf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:240
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:256
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:256
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:272
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:272
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:288
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:288
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:304
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:304
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:320
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:320
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:336
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:336
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:352
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:352
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:368
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:368
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:384
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:384
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:400
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xd94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:400
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:416
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xda4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:416
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:432
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:432
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:448
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:448
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:464
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:464
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:480
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xde4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:480
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:496
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xdf4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:496
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:512
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:512
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:528
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:528
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:544
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:544
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:560
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:560
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:576
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:576
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:592
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:592
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:608
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:608
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:624
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:624
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:640
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:640
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:656
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xe94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:656
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:672
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xea4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:672
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:688
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xeb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:688
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:704
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xec4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:704
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:720
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xed4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:720
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:736
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xee4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:736
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:752
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xef4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:752
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:768
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf04
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:768
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:784
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf14
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:784
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:800
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf24
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:800
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:816
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf34
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:816
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:832
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf44
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:832
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:848
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf54
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:848
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:864
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf64
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:864
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:880
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf74
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:880
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:896
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf84
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:896
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:912
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xf94
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:912
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:928
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfa4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:928
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:944
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfb4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:944
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:960
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfc4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:960
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:976
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfd4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:976
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:992
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xfe4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:992
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1008
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0xff4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1008
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1024
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1004
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1024
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1040
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1014
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1040
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1056
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1024
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1056
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1072
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1034
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1072
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1088
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1044
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1088
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1104
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1054
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1104
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1120
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1064
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1120
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1136
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1074
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1136
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1152
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1084
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1152
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1168
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1094
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1168
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1184
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1184
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1200
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1200
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1216
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1216
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1232
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1232
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1248
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1248
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1264
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x10f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1264
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1280
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1104
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1280
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1296
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1114
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1296
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1312
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1124
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1312
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1328
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1134
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1328
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1344
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1144
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1344
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1360
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1154
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1360
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1376
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1164
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1376
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1392
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1174
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1392
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1408
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1184
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1408
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1424
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1194
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1424
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1440
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1440
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1456
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1456
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1472
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1472
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1488
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1488
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1504
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1504
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1520
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x11f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1520
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1536
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1204
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1536
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1552
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1214
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1552
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1568
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1224
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1568
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1584
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1234
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1584
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1600
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1244
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1600
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1616
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1254
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1616
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1632
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1264
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1632
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1648
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1274
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1648
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1664
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1284
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1664
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1680
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1294
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1680
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1696
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1696
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1712
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1712
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1728
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1728
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1744
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1744
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1760
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1760
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1776
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x12f4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1776
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1792
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1304
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1792
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1808
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1314
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1808
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1824
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1324
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1824
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1840
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1334
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1840
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1856
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1344
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1856
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1872
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1354
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1872
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1888
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1364
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1888
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1904
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1374
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1904
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1920
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1384
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1920
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1936
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x1394
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1936
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1952
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13a4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1952
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1968
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13b4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1968
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:1984
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13c4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:1984
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2000
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13d4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2000
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2016
-; GFX10-FLATSCR-NEXT: s_movk_i32 s4, 0x13e4
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2016
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s4 ; 16-byte Folded Spill
-; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[2:3] offset:2032
+; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[0:3], s0 ; 16-byte Folded Spill
+; GFX10-FLATSCR-NEXT: global_load_dwordx4 v[0:3], v5, s[6:7] offset:2032
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s2, s0, v5
-; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v6, null, s1, 0, s2
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13e4
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v4, s0, s4, v5
+; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e64 v6, null, s5, 0, s0
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13e4
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
@@ -8847,520 +8847,520 @@ define amdgpu_kernel void @test(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX10-FLATSCR-NEXT: ;;#ASMSTART
; GFX10-FLATSCR-NEXT: ;;#ASMEND
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2032
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2032
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2016
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2016
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:2000
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:2000
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1984
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x13a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1984
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x13a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1968
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1394
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1968
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1394
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1952
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1384
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1952
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1384
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1936
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1374
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1936
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1374
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1920
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1364
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1920
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1364
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1904
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1354
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1904
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1354
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1888
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1344
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1888
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1344
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1872
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1334
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1872
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1334
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1856
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1324
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1856
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1324
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1840
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1314
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1840
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1314
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1824
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1304
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1824
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1304
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1808
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1808
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1792
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1792
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1776
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1776
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1760
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1760
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1744
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1744
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1728
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x12a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1728
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x12a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1712
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1294
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1712
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1294
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1696
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1284
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1696
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1284
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1680
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1274
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1680
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1274
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1664
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1264
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1664
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1264
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1648
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1254
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1648
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1254
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1632
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1244
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1632
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1244
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1616
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1234
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1616
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1234
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1600
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1224
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1600
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1224
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1584
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1214
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1584
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1214
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1568
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1204
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1568
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1204
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1552
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1552
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1536
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1536
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1520
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1520
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1504
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1504
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1488
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1488
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1472
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x11a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1472
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x11a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1456
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1194
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1456
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1194
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1440
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1184
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1440
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1184
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1424
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1174
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1424
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1174
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1408
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1164
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1408
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1164
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1392
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1154
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1392
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1154
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1376
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1144
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1376
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1144
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1360
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1134
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1360
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1134
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1344
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1124
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1344
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1124
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1328
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1114
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1328
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1114
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1312
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1104
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1312
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1104
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1296
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10f4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1296
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10f4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1280
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10e4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1280
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10e4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1264
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10d4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1264
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10d4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1248
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10c4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1248
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10c4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1232
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10b4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1232
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10b4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1216
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x10a4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1216
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x10a4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1200
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1094
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1200
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1094
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1184
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1084
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1184
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1084
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1168
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1074
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1168
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1074
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1152
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1064
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1152
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1064
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1136
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1054
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1136
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1054
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1120
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1044
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1120
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1044
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1104
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1034
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1104
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1034
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1088
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1024
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1088
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1024
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1072
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1014
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1072
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1014
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1056
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0x1004
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1056
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0x1004
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1040
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xff4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1040
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xff4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1024
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfe4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1024
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfe4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:1008
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:1008
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:992
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:992
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:976
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:976
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:960
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xfa4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:960
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xfa4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:944
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:944
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:928
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:928
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:912
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:912
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:896
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:896
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:880
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:880
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:864
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:864
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:848
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:848
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:832
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:832
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:816
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:816
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:800
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xf04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:800
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xf04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:784
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xef4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:784
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xef4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:768
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xee4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:768
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xee4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:752
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xed4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:752
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xed4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:736
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xec4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:736
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xec4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:720
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xeb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:720
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xeb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:704
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xea4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:704
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xea4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:688
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:688
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:672
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:672
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:656
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:656
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:640
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:640
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:624
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:624
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:608
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:608
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:592
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:592
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:576
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:576
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:560
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:560
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:544
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xe04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:544
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xe04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:528
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdf4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:528
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:512
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xde4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:512
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xde4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:496
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:496
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:480
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:480
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:464
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xdb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:464
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xdb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:448
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xda4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:448
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xda4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:432
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:432
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:416
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:416
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:400
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:400
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:384
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:384
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:368
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:368
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:352
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:352
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:336
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:336
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:320
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:320
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:304
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:304
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:288
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xd04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:288
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xd04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:272
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcf4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:272
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcf4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:256
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xce4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:256
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xce4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:240
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcd4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:240
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcd4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:224
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcc4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:224
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcc4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:208
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xcb4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:208
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xcb4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:192
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xca4
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:192
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xca4
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:176
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc94
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:176
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc94
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:160
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc84
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:160
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc84
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:144
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc74
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:144
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc74
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:128
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc64
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:128
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc64
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:112
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc54
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:112
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc54
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:96
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc44
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:96
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc44
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:80
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc34
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:80
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc34
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:64
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc24
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:64
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc24
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:48
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc14
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:48
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc14
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:32
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_movk_i32 s2, 0xc04
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:32
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xc04
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1] offset:16
-; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s2 ; 16-byte Folded Reload
-; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5] offset:16
+; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[0:3], off, s0 ; 16-byte Folded Reload
; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbf4
-; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x480, v4
; GFX10-FLATSCR-NEXT: scratch_load_dwordx4 v[7:10], off, s0 ; 16-byte Folded Reload
+; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
+; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(1)
+; GFX10-FLATSCR-NEXT: global_store_dwordx4 v5, v[0:3], s[4:5]
+; GFX10-FLATSCR-NEXT: v_add_co_u32 v0, vcc_lo, 0x480, v4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v6, vcc_lo
; GFX10-FLATSCR-NEXT: v_add_co_u32 v2, vcc_lo, 0x780, v0
-; GFX10-FLATSCR-NEXT: s_movk_i32 s0, 0xbe4
; GFX10-FLATSCR-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v1, vcc_lo
; GFX10-FLATSCR-NEXT: s_waitcnt vmcnt(0)
; GFX10-FLATSCR-NEXT: global_store_dwordx4 v[2:3], v[7:10], off offset:2032
diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll
index b8cf692372069..64277e8de7f14 100644
--- a/llvm/test/CodeGen/AMDGPU/sra.ll
+++ b/llvm/test/CodeGen/AMDGPU/sra.ll
@@ -27,21 +27,21 @@ define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT: v_ashrrev_i32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i32:
@@ -94,24 +94,24 @@ define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT: v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT: v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT: v_ashrrev_i32_e32 v0, v4, v0
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i32:
@@ -175,31 +175,31 @@ define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v1
-; VI-NEXT: v_readfirstlane_b32 s1, v0
-; VI-NEXT: s_ashr_i32 s2, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s3, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s1, s0
-; VI-NEXT: s_ashr_i32 s1, s2, s3
-; VI-NEXT: s_lshl_b32 s1, s1, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s0, s0, s1
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: v_readfirstlane_b32 s4, v1
+; VI-NEXT: v_readfirstlane_b32 s5, v0
+; VI-NEXT: s_ashr_i32 s6, s5, 16
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_ashr_i32 s7, s4, 16
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_ashr_i32 s4, s5, s4
+; VI-NEXT: s_ashr_i32 s5, s6, s7
+; VI-NEXT: s_lshl_b32 s5, s5, 16
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s4, s4, s5
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i16:
@@ -282,43 +282,43 @@ define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v4i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_readfirstlane_b32 s0, v2
-; VI-NEXT: v_readfirstlane_b32 s1, v3
-; VI-NEXT: v_readfirstlane_b32 s2, v0
-; VI-NEXT: v_readfirstlane_b32 s3, v1
-; VI-NEXT: s_ashr_i32 s8, s3, 16
-; VI-NEXT: s_sext_i32_i16 s3, s3
-; VI-NEXT: s_ashr_i32 s9, s2, 16
-; VI-NEXT: s_sext_i32_i16 s2, s2
-; VI-NEXT: s_ashr_i32 s10, s1, 16
-; VI-NEXT: s_sext_i32_i16 s1, s1
-; VI-NEXT: s_ashr_i32 s11, s0, 16
-; VI-NEXT: s_sext_i32_i16 s0, s0
-; VI-NEXT: s_ashr_i32 s0, s2, s0
-; VI-NEXT: s_ashr_i32 s2, s9, s11
-; VI-NEXT: s_ashr_i32 s1, s3, s1
-; VI-NEXT: s_ashr_i32 s3, s8, s10
-; VI-NEXT: s_lshl_b32 s3, s3, 16
-; VI-NEXT: s_and_b32 s1, s1, 0xffff
-; VI-NEXT: s_lshl_b32 s2, s2, 16
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
-; VI-NEXT: s_or_b32 s1, s1, s3
-; VI-NEXT: s_or_b32 s0, s0, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: v_readfirstlane_b32 s4, v2
+; VI-NEXT: v_readfirstlane_b32 s5, v3
+; VI-NEXT: v_readfirstlane_b32 s6, v0
+; VI-NEXT: v_readfirstlane_b32 s7, v1
+; VI-NEXT: s_ashr_i32 s8, s7, 16
+; VI-NEXT: s_sext_i32_i16 s7, s7
+; VI-NEXT: s_ashr_i32 s9, s6, 16
+; VI-NEXT: s_sext_i32_i16 s6, s6
+; VI-NEXT: s_ashr_i32 s10, s5, 16
+; VI-NEXT: s_sext_i32_i16 s5, s5
+; VI-NEXT: s_ashr_i32 s11, s4, 16
+; VI-NEXT: s_sext_i32_i16 s4, s4
+; VI-NEXT: s_ashr_i32 s4, s6, s4
+; VI-NEXT: s_ashr_i32 s6, s9, s11
+; VI-NEXT: s_ashr_i32 s5, s7, s5
+; VI-NEXT: s_ashr_i32 s7, s8, s10
+; VI-NEXT: s_lshl_b32 s7, s7, 16
+; VI-NEXT: s_and_b32 s5, s5, 0xffff
+; VI-NEXT: s_lshl_b32 s6, s6, 16
+; VI-NEXT: s_and_b32 s4, s4, 0xffff
+; VI-NEXT: s_or_b32 s5, s5, s7
+; VI-NEXT: s_or_b32 s4, s4, s6
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v4i16:
@@ -409,16 +409,16 @@ define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) {
;
; VI-LABEL: s_ashr_i64:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s5, s4, 31
-; VI-NEXT: s_ashr_i64 s[4:5], s[4:5], 8
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s3, s2, 31
+; VI-NEXT: s_ashr_i64 s[0:1], s[2:3], 8
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_i64:
@@ -461,20 +461,20 @@ define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_i64_2:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i64 v[0:1], v2, v[0:1]
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_i64_2:
@@ -533,22 +533,22 @@ define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: ashr_v2i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT: v_ashrrev_i64 v[0:1], v4, v[0:1]
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: ashr_v2i64:
@@ -730,18 +730,18 @@ define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s6, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s8, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s7, s6, 31
-; VI-NEXT: s_add_u32 s4, s6, s4
-; VI-NEXT: s_addc_u32 s5, s7, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s1, s8, 31
+; VI-NEXT: s_add_u32 s0, s8, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
@@ -785,17 +785,17 @@ define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_ashr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -849,18 +849,18 @@ define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_ashr_63_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s6, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x74
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s8, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x74
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_ashr_i32 s6, s6, 31
-; VI-NEXT: s_add_u32 s4, s6, s4
-; VI-NEXT: s_addc_u32 s5, s6, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_ashr_i32 s1, s8, 31
+; VI-NEXT: s_add_u32 s0, s1, s2
+; VI-NEXT: s_addc_u32 s1, s1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_ashr_63_i64:
@@ -905,17 +905,17 @@ define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_ashr_63_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v3
diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll
index 418c160d4244a..d33723cff9629 100644
--- a/llvm/test/CodeGen/AMDGPU/srl.ll
+++ b/llvm/test/CodeGen/AMDGPU/srl.ll
@@ -26,15 +26,15 @@ define amdgpu_kernel void @lshr_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: lshr_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s4, s4, s5
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s0, s0, s1
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i32:
@@ -83,17 +83,17 @@ define amdgpu_kernel void @lshr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: lshr_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b32 s5, s5, s7
-; VI-NEXT: s_lshr_b32 s4, s4, s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshr_b32 s1, s1, s3
+; VI-NEXT: s_lshr_b32 s0, s0, s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_v2i32:
@@ -212,16 +212,16 @@ define amdgpu_kernel void @lshr_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: lshr_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s6
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: lshr_i64:
@@ -382,14 +382,14 @@ define amdgpu_kernel void @s_lshr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %
;
; VI-LABEL: s_lshr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s4, s[0:1], 0x50
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x50
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_lshr_32_i64:
@@ -428,17 +428,17 @@ define amdgpu_kernel void @v_lshr_32_i64(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_lshr_32_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s3
-; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v2
+; VI-NEXT: v_mov_b32_e32 v0, s7
+; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s5
+; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll
index 45aa544bf80af..6175d49930cb6 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.ll
@@ -72,13 +72,13 @@ define amdgpu_kernel void @s_sub_imm_i32(ptr addrspace(1) %out, i32 %a) {
;
; GFX8-LABEL: s_sub_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_i32 s2, 0x4d2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: s_sub_i32 s0, 0x4d2, s4
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -130,13 +130,13 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: test_sub_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
; GFX8-NEXT: flat_store_dword v[2:3], v0
@@ -144,24 +144,24 @@ define amdgpu_kernel void @test_sub_i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: test_sub_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v2, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v1
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -194,13 +194,13 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX8-LABEL: test_sub_imm_i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dword v2, v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 0x7b, v2
; GFX8-NEXT: flat_store_dword v[0:1], v2
@@ -208,24 +208,24 @@ define amdgpu_kernel void @test_sub_imm_i32(ptr addrspace(1) %out, ptr addrspace
;
; GFX9-LABEL: test_sub_imm_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, 0x7b, v1
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_imm_i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v1, v0, s[2:3]
+; GFX12-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v1, 0x7b, v1
-; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -257,13 +257,13 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v2i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
@@ -272,26 +272,26 @@ define amdgpu_kernel void @test_sub_v2i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v1, v1, v3
; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v2i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b128 v[0:3], v4, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v1, v3
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v0, v2
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -328,18 +328,18 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v4i32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: s_add_u32 s2, s2, 16
-; GFX8-NEXT: s_addc_u32 s3, s3, 0
-; GFX8-NEXT: v_mov_b32_e32 v5, s3
-; GFX8-NEXT: v_mov_b32_e32 v4, s2
+; GFX8-NEXT: s_add_u32 s0, s6, 16
+; GFX8-NEXT: s_addc_u32 s1, s7, 0
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v8, s0
-; GFX8-NEXT: v_mov_b32_e32 v9, s1
+; GFX8-NEXT: v_mov_b32_e32 v8, s4
+; GFX8-NEXT: v_mov_b32_e32 v9, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
@@ -350,33 +350,33 @@ define amdgpu_kernel void @test_sub_v4i32(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v4i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:16
-; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[6:7] offset:16
+; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u32_e32 v3, v7, v3
; GFX9-NEXT: v_sub_u32_e32 v2, v6, v2
; GFX9-NEXT: v_sub_u32_e32 v1, v5, v1
; GFX9-NEXT: v_sub_u32_e32 v0, v4, v0
-; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v4i32:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
-; GFX12-NEXT: global_load_b128 v[0:3], v8, s[2:3] offset:16
-; GFX12-NEXT: global_load_b128 v[4:7], v8, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v8, s[6:7] offset:16
+; GFX12-NEXT: global_load_b128 v[4:7], v8, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3
; GFX12-NEXT: v_sub_nc_u32_e32 v2, v6, v2
; GFX12-NEXT: v_sub_nc_u32_e32 v1, v5, v1
; GFX12-NEXT: v_sub_nc_u32_e32 v0, v4, v0
-; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
+; GFX12-NEXT: global_store_b128 v8, v[0:3], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -412,11 +412,11 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX8-LABEL: test_sub_i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
@@ -424,38 +424,38 @@ define amdgpu_kernel void @test_sub_i16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_sub_u16_e32 v2, v4, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: test_sub_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] offset:2 glc
+; GFX9-NEXT: global_load_ushort v2, v0, s[6:7] offset:2 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_sub_u16_e32 v1, v1, v2
-; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u16 v1, v0, s[6:7] scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_load_u16 v0, v0, s[2:3] offset:2 scope:SCOPE_SYS
+; GFX12-NEXT: global_load_u16 v0, v0, s[6:7] offset:2 scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_nc_u16 v0, v1, v0
-; GFX12-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b16 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -497,15 +497,15 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v2i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v4, v0, v1
; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -515,26 +515,26 @@ define amdgpu_kernel void @test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v2i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
-; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX9-NEXT: global_store_dword v2, v0, s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v2i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3]
+; GFX12-NEXT: global_load_b64 v[0:1], v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_sub_i16 v0, v0, v1
-; GFX12-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX12-NEXT: global_store_b32 v2, v0, s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -583,15 +583,15 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX8-LABEL: test_sub_v4i16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s7
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v6, v1, v3
; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -604,28 +604,28 @@ define amdgpu_kernel void @test_sub_v4i16(ptr addrspace(1) %out, ptr addrspace(1
;
; GFX9-LABEL: test_sub_v4i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3]
+; GFX9-NEXT: global_load_dwordx4 v[0:3], v0, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v1, v3
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
;
; GFX12-LABEL: test_sub_v4i16:
; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b128 v[0:3], v0, s[2:3]
+; GFX12-NEXT: global_load_b128 v[0:3], v0, s[6:7]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_pk_sub_i16 v1, v1, v3
; GFX12-NEXT: v_pk_sub_i16 v0, v0, v2
-; GFX12-NEXT: global_store_b64 v4, v[0:1], s[0:1]
+; GFX12-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -657,14 +657,14 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX8-LABEL: s_sub_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_sub_u32 s2, s4, s6
-; GFX8-NEXT: s_subb_u32 s3, s5, s7
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_sub_u32 s0, s4, s6
+; GFX8-NEXT: s_subb_u32 s1, s5, s7
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
;
@@ -685,12 +685,12 @@ define amdgpu_kernel void @s_sub_i64(ptr addrspace(1) noalias %out, i64 %a, i64
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x2c
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_sub_nc_u64 s[2:3], s[4:5], s[6:7]
+; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[4:5], s[6:7]
; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -726,14 +726,14 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX8-LABEL: v_sub_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -764,13 +764,13 @@ define amdgpu_kernel void @v_sub_i64(ptr addrspace(1) noalias %out, ptr addrspac
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX12-NEXT: v_mov_b32_e32 v4, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b64 v[0:1], v2, s[6:7]
-; GFX12-NEXT: global_load_b64 v[2:3], v2, s[0:1]
+; GFX12-NEXT: global_load_b64 v[2:3], v2, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2
; GFX12-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
@@ -817,14 +817,14 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: v_test_sub_v2i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, s0, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, s2, v2
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v3, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -859,13 +859,13 @@ define amdgpu_kernel void @v_test_sub_v2i64(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX12-NEXT: v_mov_b32_e32 v8, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: global_load_b128 v[0:3], v4, s[6:7]
-; GFX12-NEXT: global_load_b128 v[4:7], v4, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v4, s[2:3]
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
@@ -922,14 +922,14 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX8-LABEL: v_test_sub_v4i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s7
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s6, v0
; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v12, vcc, s2, v0
; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dwordx4 v[0:3], v[8:9]
; GFX8-NEXT: flat_load_dwordx4 v[4:7], v[12:13]
@@ -988,15 +988,15 @@ define amdgpu_kernel void @v_test_sub_v4i64(ptr addrspace(1) %out, ptr addrspace
; GFX12: ; %bb.0:
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX12-NEXT: v_lshlrev_b32_e32 v12, 5, v0
; GFX12-NEXT: v_mov_b32_e32 v16, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_clause 0x3
; GFX12-NEXT: global_load_b128 v[0:3], v12, s[6:7]
-; GFX12-NEXT: global_load_b128 v[4:7], v12, s[0:1]
+; GFX12-NEXT: global_load_b128 v[4:7], v12, s[2:3]
; GFX12-NEXT: global_load_b128 v[8:11], v12, s[6:7] offset:16
-; GFX12-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:16
+; GFX12-NEXT: global_load_b128 v[12:15], v12, s[2:3] offset:16
; GFX12-NEXT: s_wait_loadcnt 0x2
; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v6
; GFX12-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v7, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
index 6c53afe840d18..c1e72567a9095 100644
--- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -25,14 +25,14 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -68,12 +68,12 @@ define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -114,12 +114,12 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; VI-LABEL: s_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s6, s[6:7], 0x0
-; VI-NEXT: s_load_dword s7, s[0:1], 0x0
+; VI-NEXT: s_load_dword s7, s[8:9], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -153,14 +153,14 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s2, s[6:7], 0x0
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[6:7], 0x0
+; GFX11-NEXT: s_load_b32 s1, s[2:3], 0x0
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_pk_sub_i16 v0, s2, s0
+; GFX11-NEXT: v_pk_sub_i16 v0, s0, s1
; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -175,32 +175,32 @@ define amdgpu_kernel void @s_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace
define amdgpu_kernel void @s_test_sub_self_v2i16(ptr addrspace(1) %out, ptr addrspace(4) %in0) #1 {
; GCN-LABEL: s_test_sub_self_v2i16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GFX10-LABEL: s_test_sub_self_v2i16:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: s_test_sub_self_v2i16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -277,62 +277,62 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(ptr addrspace(1) %out, <2 x
define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b
+; GFX9-NEXT: s_mov_b32 s0, 0x1c8007b
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 0xff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0x1c8007b
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -349,62 +349,62 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(ptr addrspace(1) %out, ptr
define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_neg_constant:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3
+; GFX9-NEXT: s_mov_b32 s0, 0xfc21fcb3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_pk_sub_i16 v0, v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
+; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x3df
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_neg_constant:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_neg_constant:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 0xfc21fcb3
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -420,61 +420,61 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(ptr addrspace(1) %out,
define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_e32 v2, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_neg1:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, -1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -490,60 +490,60 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(ptr addrspace(1) %out, p
define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 32
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -560,60 +560,60 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(ptr addrspace(1) %
define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(ptr addrspace(1) %out, ptr addrspace(1) %in0) #1 {
; GFX9-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc
+; GFX9-NEXT: global_load_dword v0, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
+; GFX9-NEXT: s_mov_b32 s7, 0xf000
+; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_inline_fp_split:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: v_mov_b32_e32 v1, s7
+; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0xffffc080
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc
+; GFX10-NEXT: global_load_dword v0, v0, s[6:7] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_mov_b32 s3, 0x31016000
-; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: s_mov_b32 s7, 0x31016000
+; GFX10-NEXT: s_mov_b32 s6, -1
; GFX10-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_test_sub_v2i16_inline_fp_split:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s7, 0x31016000
+; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: v_pk_sub_i16 v0, v0, 1.0
-; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -649,14 +649,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -693,12 +693,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -747,14 +747,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -795,12 +795,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -848,14 +848,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -894,12 +894,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
@@ -948,14 +948,14 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
@@ -998,12 +998,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(ptr addrspace(1) %out,
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT: global_load_b32 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_mov_b32 s7, 0x31016000
; GFX11-NEXT: s_mov_b32 s6, -1
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 7dce633e9186a..8486fbab1abad 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -12,11 +12,11 @@ declare void @llvm.debugtrap() #1
define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: trap:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
;
@@ -103,16 +103,16 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %arg0) local_unnamed_addr {
; NOHSA-TRAP-GFX900-LABEL: non_entry_trap:
; NOHSA-TRAP-GFX900: ; %bb.0: ; %entry
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[2:3] glc
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: v_cmp_eq_u32_e32 vcc, -1, v1
; NOHSA-TRAP-GFX900-NEXT: s_cbranch_vccz .LBB1_2
; NOHSA-TRAP-GFX900-NEXT: ; %bb.1: ; %ret
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 3
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
; NOHSA-TRAP-GFX900-NEXT: .LBB1_2: ; %trap
@@ -267,14 +267,14 @@ ret:
define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1) {
; NOHSA-TRAP-GFX900-LABEL: trap_with_use_after:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[0:1] glc
+; NOHSA-TRAP-GFX900-NEXT: global_load_dword v1, v0, s[4:5] glc
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_cbranch_execnz .LBB2_2
; NOHSA-TRAP-GFX900-NEXT: ; %bb.1:
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[6:7]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: .LBB2_2:
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
@@ -403,14 +403,14 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0) {
; NOHSA-TRAP-GFX900-LABEL: debugtrap:
; NOHSA-TRAP-GFX900: ; %bb.0:
-; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; NOHSA-TRAP-GFX900-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v0, 0
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v1, 1
; NOHSA-TRAP-GFX900-NEXT: v_mov_b32_e32 v2, 2
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt lgkmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v1, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
-; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[0:1]
+; NOHSA-TRAP-GFX900-NEXT: global_store_dword v0, v2, s[2:3]
; NOHSA-TRAP-GFX900-NEXT: s_waitcnt vmcnt(0)
; NOHSA-TRAP-GFX900-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
index c0c56ebb16610..b6056f6a51707 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll
@@ -104,15 +104,15 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(ptr addrspace(1) noc
; VI-LABEL: truncate_high_elt_extract_vector:
; VI: ; %bb.0: ; %bb
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s2, s[4:5], 0x0
-; VI-NEXT: s_load_dword s3, s[6:7], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: s_load_dword s0, s[4:5], 0x0
+; VI-NEXT: s_load_dword s1, s[6:7], 0x0
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_sext_i32_i16 s0, s2
-; VI-NEXT: s_sext_i32_i16 s1, s3
+; VI-NEXT: s_sext_i32_i16 s0, s0
+; VI-NEXT: s_sext_i32_i16 s1, s1
; VI-NEXT: s_mul_i32 s1, s1, s0
; VI-NEXT: s_lshr_b32 s0, s1, 16
; VI-NEXT: v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
index e668c1d2b7f3d..d9e0e0298e072 100644
--- a/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
+++ b/llvm/test/CodeGen/AMDGPU/twoaddr-constrain.ll
@@ -11,11 +11,11 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY killed $sgpr1
; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0:sreg_64 = COPY killed [[COPY]]
; CHECK-NEXT: [[COPY2:%[0-9]+]].sub1:sreg_64 = COPY killed [[COPY1]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
+ ; CHECK-NEXT: early-clobber %11:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec [[COPY2]], 0, 0 :: (invariant load (<2 x s32>) from %ir.ptr, align 4, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed [[COPY2]], 8, 0 :: (invariant load (s32) from %ir.ptr + 8, addrspace 4)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[S_LOAD_DWORDX2_IMM]].sub0
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %11.sub0
; CHECK-NEXT: $sgpr0 = COPY killed [[COPY3]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORDX2_IMM]].sub1
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY killed %11.sub1
; CHECK-NEXT: $sgpr1 = COPY killed [[COPY4]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY killed [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: $sgpr2 = COPY killed [[COPY5]]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index 416dbb226422c..eb457766eef39 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -28,12 +28,12 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-LABEL: s_uaddo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_add_u32 s0, s6, s0
+; VI-NEXT: s_add_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_addc_u32 s1, s7, s1
+; VI-NEXT: s_addc_u32 s1, s7, s3
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -96,12 +96,12 @@ define amdgpu_kernel void @s_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_uaddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -161,18 +161,18 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_uaddo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -243,18 +243,18 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_uaddo_i32_novcc:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_uaddo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_uaddo_i32_novcc:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -325,19 +325,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_uaddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_add_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_addc_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_add_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_addc_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_uaddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_addc_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_add_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_addc_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %uadd, 0
@@ -401,18 +401,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, v1, v3, vcc
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,18 +486,18 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_uaddo_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_lt_u16_e32 vcc, v5, v4
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_uaddo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_uaddo_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_lt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -568,18 +568,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_uaddo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_uaddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_uaddo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index c7952f561427c..8e75127951e3a 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -44,17 +44,17 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
;
; VI-LABEL: udiv_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v2, v1
; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1
@@ -75,7 +75,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; VI-NEXT: v_add_u32_e32 v3, vcc, 1, v2
; VI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32:
@@ -401,17 +401,17 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: udiv_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_mov_b32 s2, s6
+; VI-NEXT: s_mov_b32 s3, s7
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s10
+; VI-NEXT: s_mov_b32 s1, s11
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s4, s8
+; VI-NEXT: s_mov_b32 s5, s9
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v4, v2
; VI-NEXT: v_cvt_f32_u32_e32 v5, v3
@@ -714,18 +714,18 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: udiv_v4i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
-; VI-NEXT: s_mov_b32 s6, s10
-; VI-NEXT: s_mov_b32 s7, s11
+; VI-NEXT: s_mov_b32 s2, s10
+; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 offset:16
-; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s8, s0
-; VI-NEXT: s_mov_b32 s9, s1
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:16
+; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s8, s4
+; VI-NEXT: s_mov_b32 s9, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_u32_e32 v8, v0
; VI-NEXT: v_cvt_f32_u32_e32 v10, v1
@@ -1116,20 +1116,20 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: udiv_i32_div_pow2:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 4, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_pow2:
@@ -1203,22 +1203,22 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
;
; VI-LABEL: udiv_i32_div_k_even:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0xfabbd9c1
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, 0xfabbd9c1
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_hi_u32 v0, v0, s2
+; VI-NEXT: v_mul_hi_u32 v0, v0, s0
+; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 25, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_even:
@@ -1297,22 +1297,22 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
;
; VI-LABEL: udiv_i32_div_k_odd:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s2, 0x7d5deca3
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, 0x7d5deca3
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mul_hi_u32 v0, v0, s2
+; VI-NEXT: v_mul_hi_u32 v0, v0, s0
+; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 24, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: udiv_i32_div_k_odd:
@@ -1400,18 +1400,18 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
;
; VI-LABEL: v_udiv_i8:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:1
; VI-NEXT: buffer_load_ubyte v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0
@@ -1424,7 +1424,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i8:
@@ -1540,18 +1540,18 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1564,7 +1564,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i16:
@@ -1688,20 +1688,20 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i23:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -1720,7 +1720,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0x7fffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i23:
@@ -1885,20 +1885,20 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
;
; VI-LABEL: v_udiv_i24:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0 offset:6
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0 offset:4
; VI-NEXT: buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; VI-NEXT: buffer_load_ushort v3, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@@ -1917,7 +1917,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
; VI-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: v_udiv_i24:
@@ -2076,30 +2076,30 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
;
; VI-LABEL: scalarize_mulhu_4xi32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
-; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s0, 0x1389c755
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
+; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s4, 0x1389c755
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 2, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
-; VI-NEXT: v_mul_hi_u32 v0, v0, s0
-; VI-NEXT: v_mul_hi_u32 v1, v1, s0
-; VI-NEXT: v_mul_hi_u32 v2, v2, s0
-; VI-NEXT: v_mul_hi_u32 v3, v3, s0
+; VI-NEXT: v_mul_hi_u32 v0, v0, s4
+; VI-NEXT: v_mul_hi_u32 v1, v1, s4
+; VI-NEXT: v_mul_hi_u32 v2, v2, s4
+; VI-NEXT: v_mul_hi_u32 v3, v3, s4
; VI-NEXT: v_lshrrev_b32_e32 v0, 10, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 10, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 10, v2
; VI-NEXT: v_lshrrev_b32_e32 v3, 10, v3
-; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GCN-LABEL: scalarize_mulhu_4xi32:
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index f0f0b6680e0e6..0bb21278918c3 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -77,37 +77,37 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a
;
; GFX8-LABEL: test_udivrem:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dword s4, s[0:1], 0x98
-; GFX8-NEXT: s_load_dword s5, s[0:1], 0x74
+; GFX8-NEXT: s_load_dword s6, s[0:1], 0x98
+; GFX8-NEXT: s_load_dword s7, s[0:1], 0x74
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s4
-; GFX8-NEXT: s_sub_i32 s2, 0, s4
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
+; GFX8-NEXT: s_sub_i32 s2, 0, s6
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX8-NEXT: v_mul_lo_u32 v1, s2, v0
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1
-; GFX8-NEXT: v_mul_hi_u32 v4, s5, v0
+; GFX8-NEXT: v_mul_hi_u32 v4, s7, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_readfirstlane_b32 s0, v4
-; GFX8-NEXT: s_mul_i32 s0, s0, s4
-; GFX8-NEXT: s_sub_i32 s0, s5, s0
-; GFX8-NEXT: s_sub_i32 s1, s0, s4
+; GFX8-NEXT: s_mul_i32 s0, s0, s6
+; GFX8-NEXT: s_sub_i32 s0, s7, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4
-; GFX8-NEXT: s_cmp_ge_u32 s0, s4
+; GFX8-NEXT: s_cmp_ge_u32 s0, s6
; GFX8-NEXT: s_cselect_b64 vcc, -1, 0
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT: s_sub_i32 s1, s0, s4
+; GFX8-NEXT: s_sub_i32 s1, s0, s6
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 1, v4
-; GFX8-NEXT: s_cmp_ge_u32 s0, s4
+; GFX8-NEXT: s_cmp_ge_u32 s0, s6
; GFX8-NEXT: s_cselect_b64 vcc, -1, 0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; GFX8-NEXT: s_cselect_b32 s0, s1, s0
@@ -212,7 +212,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-LABEL: test_udivrem_v2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX8-NEXT: s_sub_i32 s2, 0, s6
@@ -227,7 +226,6 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: v_mul_hi_u32 v0, s4, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_readfirstlane_b32 s2, v0
; GFX8-NEXT: s_mul_i32 s2, s2, s6
; GFX8-NEXT: s_sub_i32 s2, s4, s2
@@ -236,24 +234,27 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x,
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s6
; GFX8-NEXT: s_cmp_ge_u32 s2, s6
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, 0, s7
-; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_cselect_b32 s4, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s7
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v1, s5, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: s_mul_i32 s2, s2, s7
-; GFX8-NEXT: s_sub_i32 s2, s5, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s7
-; GFX8-NEXT: s_cmp_ge_u32 s2, s7
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s7
-; GFX8-NEXT: s_cmp_ge_u32 s2, s7
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_readfirstlane_b32 s0, v1
+; GFX8-NEXT: s_mul_i32 s0, s0, s7
+; GFX8-NEXT: s_sub_i32 s0, s5, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s7
+; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s7
+; GFX8-NEXT: s_cmp_ge_u32 s0, s7
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
%result0 = udiv <2 x i32> %x, %y
@@ -419,14 +420,11 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-LABEL: test_udivrem_v4:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX8-NEXT: s_sub_i32 s2, 0, s8
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s9
-; GFX8-NEXT: v_mov_b32_e32 v5, s1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0
@@ -445,9 +443,9 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: s_cselect_b32 s2, s3, s2
; GFX8-NEXT: s_sub_i32 s3, s2, s8
; GFX8-NEXT: s_cmp_ge_u32 s2, s8
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, 0, s9
-; GFX8-NEXT: v_mul_lo_u32 v0, s3, v1
+; GFX8-NEXT: s_cselect_b32 s4, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s9
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
@@ -455,50 +453,54 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x,
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s11
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: s_mul_i32 s3, s3, s9
-; GFX8-NEXT: s_sub_i32 s3, s5, s3
-; GFX8-NEXT: s_sub_i32 s4, s3, s9
-; GFX8-NEXT: s_cmp_ge_u32 s3, s9
-; GFX8-NEXT: s_cselect_b32 s3, s4, s3
-; GFX8-NEXT: s_sub_i32 s4, s3, s9
-; GFX8-NEXT: s_cmp_ge_u32 s3, s9
-; GFX8-NEXT: s_cselect_b32 s3, s4, s3
-; GFX8-NEXT: s_sub_i32 s4, 0, s10
-; GFX8-NEXT: v_mul_lo_u32 v0, s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_mul_i32 s2, s2, s9
+; GFX8-NEXT: s_sub_i32 s2, s5, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s9
+; GFX8-NEXT: s_cmp_ge_u32 s2, s9
+; GFX8-NEXT: s_cselect_b32 s2, s3, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s9
+; GFX8-NEXT: s_cmp_ge_u32 s2, s9
+; GFX8-NEXT: s_cselect_b32 s5, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s10
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v0, s6, v0
; GFX8-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v2
; GFX8-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s4, v0
-; GFX8-NEXT: s_mul_i32 s4, s4, s10
-; GFX8-NEXT: s_sub_i32 s4, s6, s4
-; GFX8-NEXT: s_sub_i32 s5, s4, s10
-; GFX8-NEXT: s_cmp_ge_u32 s4, s10
-; GFX8-NEXT: s_cselect_b32 s4, s5, s4
-; GFX8-NEXT: s_sub_i32 s5, s4, s10
-; GFX8-NEXT: s_cmp_ge_u32 s4, s10
-; GFX8-NEXT: s_cselect_b32 s4, s5, s4
-; GFX8-NEXT: s_sub_i32 s5, 0, s11
-; GFX8-NEXT: v_mul_lo_u32 v0, s5, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_readfirstlane_b32 s2, v0
+; GFX8-NEXT: s_mul_i32 s2, s2, s10
+; GFX8-NEXT: s_sub_i32 s2, s6, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s10
+; GFX8-NEXT: s_cmp_ge_u32 s2, s10
+; GFX8-NEXT: s_cselect_b32 s2, s3, s2
+; GFX8-NEXT: s_sub_i32 s3, s2, s10
+; GFX8-NEXT: s_cmp_ge_u32 s2, s10
+; GFX8-NEXT: s_cselect_b32 s6, s3, s2
+; GFX8-NEXT: s_sub_i32 s2, 0, s11
+; GFX8-NEXT: v_mul_lo_u32 v0, s2, v1
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, s3
+; GFX8-NEXT: v_mov_b32_e32 v4, s2
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0
; GFX8-NEXT: v_mul_hi_u32 v3, s7, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s2
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
-; GFX8-NEXT: v_readfirstlane_b32 s2, v3
-; GFX8-NEXT: s_mul_i32 s2, s2, s11
-; GFX8-NEXT: s_sub_i32 s2, s7, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s11
-; GFX8-NEXT: s_cmp_ge_u32 s2, s11
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: s_sub_i32 s3, s2, s11
-; GFX8-NEXT: s_cmp_ge_u32 s2, s11
-; GFX8-NEXT: s_cselect_b32 s2, s3, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_readfirstlane_b32 s0, v3
+; GFX8-NEXT: s_mul_i32 s0, s0, s11
+; GFX8-NEXT: s_sub_i32 s0, s7, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s11
+; GFX8-NEXT: s_cmp_ge_u32 s0, s11
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: s_sub_i32 s1, s0, s11
+; GFX8-NEXT: s_cmp_ge_u32 s0, s11
+; GFX8-NEXT: s_cselect_b32 s0, s1, s0
+; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GFX8-NEXT: s_endpgm
%result0 = udiv <4 x i32> %x, %y
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 79b0a966bc1fb..2a4066d5bf8c2 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -28,42 +28,42 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_min_u32 s4, s4, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_min_u32 s2, s0, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
+; GFX8-NEXT: v_ldexp_f32 v0, v0, s0
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s4, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s0, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -100,12 +100,12 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -116,8 +116,8 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX8-NEXT: v_min_u32_e32 v1, 1, v1
; GFX8-NEXT: v_or_b32_e32 v1, v2, v1
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0
; GFX8-NEXT: v_ldexp_f32 v1, v1, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v1
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
@@ -126,11 +126,11 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -145,7 +145,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -180,39 +180,39 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, i64 %i
;
; GFX8-LABEL: s_uint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s4, s3
-; GFX8-NEXT: s_min_u32 s4, s4, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: s_sub_i32 s0, 32, s4
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_min_u32 s2, s0, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s2
+; GFX8-NEXT: v_mov_b32_e32 v0, s4
+; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_ldexp_f32 v2, v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX11-LABEL: s_uint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s4, s3
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s4, s4, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT: s_min_u32 s2, s0, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_min_u32 s2, s2, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
+; GFX11-NEXT: s_min_u32 s0, s0, 1
+; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: s_sub_i32 s2, 32, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: s_sub_i32 s0, 32, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -248,23 +248,23 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX8-LABEL: v_uint_to_fp_i64_to_f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v2
; GFX8-NEXT: v_min_u32_e32 v4, 32, v0
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v4, v[1:2]
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v5, v0
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v3
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc
; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v4
; GFX8-NEXT: v_ldexp_f32 v2, v5, v2
@@ -273,11 +273,11 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
;
; GFX11-LABEL: v_uint_to_fp_i64_to_f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT: global_load_b64 v[1:2], v1, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -290,7 +290,7 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(ptr addrspace(1) %out, ptr ad
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_ldexp_f32 v1, v1, v2
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -333,26 +333,26 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_flbit_i32_b32 s3, s5
-; GFX8-NEXT: s_min_u32 s8, s2, 32
-; GFX8-NEXT: s_min_u32 s9, s3, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_flbit_i32_b32 s1, s5
+; GFX8-NEXT: s_min_u32 s8, s0, 32
+; GFX8-NEXT: s_min_u32 s9, s1, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_min_u32 s2, s4, 1
-; GFX8-NEXT: s_or_b32 s2, s5, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s8
-; GFX8-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
-; GFX8-NEXT: v_ldexp_f32 v0, v2, s2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
-; GFX8-NEXT: v_mov_b32_e32 v2, s0
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_min_u32 s0, s4, 1
+; GFX8-NEXT: s_or_b32 s0, s5, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s8
+; GFX8-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
+; GFX8-NEXT: v_ldexp_f32 v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX8-NEXT: s_endpgm
;
@@ -360,27 +360,27 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v3, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s7
-; GFX11-NEXT: s_clz_i32_u32 s3, s5
-; GFX11-NEXT: s_min_u32 s8, s2, 32
-; GFX11-NEXT: s_min_u32 s9, s3, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
+; GFX11-NEXT: s_clz_i32_u32 s1, s5
+; GFX11-NEXT: s_min_u32 s8, s0, 32
+; GFX11-NEXT: s_min_u32 s9, s1, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v1, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v0, v2, s3
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: v_ldexp_f32 v1, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v0, v2, s1
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -443,19 +443,19 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, s0, v0
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v0
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v4
@@ -496,12 +496,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
@@ -540,7 +540,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(ptr addrspace(1) %out, pt
; GFX11-NEXT: v_ldexp_f32 v2, v1, v10
; GFX11-NEXT: v_ldexp_f32 v1, v6, v11
; GFX11-NEXT: v_ldexp_f32 v0, v4, v5
-; GFX11-NEXT: global_store_b128 v7, v[0:3], s[0:1]
+; GFX11-NEXT: global_store_b128 v7, v[0:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -587,29 +587,29 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX8-LABEL: s_uint_to_fp_v2i64_to_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_flbit_i32_b32 s2, s7
-; GFX8-NEXT: s_flbit_i32_b32 s3, s5
-; GFX8-NEXT: s_min_u32 s8, s2, 32
-; GFX8-NEXT: s_min_u32 s9, s3, 32
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX8-NEXT: s_lshl_b64 s[2:3], s[4:5], s9
-; GFX8-NEXT: s_min_u32 s2, s2, 1
-; GFX8-NEXT: s_or_b32 s2, s3, s2
-; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2
+; GFX8-NEXT: s_flbit_i32_b32 s0, s7
+; GFX8-NEXT: s_flbit_i32_b32 s1, s5
+; GFX8-NEXT: s_min_u32 s8, s0, 32
+; GFX8-NEXT: s_min_u32 s9, s1, 32
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX8-NEXT: s_lshl_b64 s[0:1], s[4:5], s9
+; GFX8-NEXT: s_min_u32 s0, s0, 1
+; GFX8-NEXT: s_or_b32 s0, s1, s0
+; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s0
; GFX8-NEXT: s_sub_i32 s6, 32, s8
-; GFX8-NEXT: s_sub_i32 s2, 32, s9
+; GFX8-NEXT: s_sub_i32 s0, 32, s9
; GFX8-NEXT: v_ldexp_f32 v0, v0, s6
-; GFX8-NEXT: v_ldexp_f32 v1, v1, s2
+; GFX8-NEXT: v_ldexp_f32 v1, v1, s0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_or_b32_e32 v2, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_mov_b32_e32 v0, s2
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -617,32 +617,32 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(ptr addrspace(1) %out, <2
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x34
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_clz_i32_u32 s2, s7
-; GFX11-NEXT: s_clz_i32_u32 s3, s5
-; GFX11-NEXT: s_min_u32 s8, s2, 32
-; GFX11-NEXT: s_min_u32 s9, s3, 32
-; GFX11-NEXT: s_lshl_b64 s[2:3], s[6:7], s8
+; GFX11-NEXT: s_clz_i32_u32 s0, s7
+; GFX11-NEXT: s_clz_i32_u32 s1, s5
+; GFX11-NEXT: s_min_u32 s8, s0, 32
+; GFX11-NEXT: s_min_u32 s9, s1, 32
+; GFX11-NEXT: s_lshl_b64 s[0:1], s[6:7], s8
; GFX11-NEXT: s_lshl_b64 s[4:5], s[4:5], s9
-; GFX11-NEXT: s_min_u32 s2, s2, 1
+; GFX11-NEXT: s_min_u32 s0, s0, 1
; GFX11-NEXT: s_min_u32 s4, s4, 1
-; GFX11-NEXT: s_or_b32 s2, s3, s2
-; GFX11-NEXT: s_or_b32 s3, s5, s4
-; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2
-; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX11-NEXT: s_sub_i32 s2, 32, s8
-; GFX11-NEXT: s_sub_i32 s3, 32, s9
+; GFX11-NEXT: s_or_b32 s0, s1, s0
+; GFX11-NEXT: s_or_b32 s1, s5, s4
+; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_u32_e32 v1, s1
+; GFX11-NEXT: s_sub_i32 s0, 32, s8
+; GFX11-NEXT: s_sub_i32 s1, 32, s9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_ldexp_f32 v0, v0, s2
-; GFX11-NEXT: v_ldexp_f32 v1, v1, s3
+; GFX11-NEXT: v_ldexp_f32 v0, v0, s0
+; GFX11-NEXT: v_ldexp_f32 v1, v1, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX11-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -713,18 +713,18 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX8-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX8: ; %bb.0:
-; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 5, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s2, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s7
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, s6, v1
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v2, vcc
; GFX8-NEXT: flat_load_dwordx4 v[1:4], v[5:6]
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 16, v5
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
; GFX8-NEXT: flat_load_dwordx4 v[5:8], v[5:6]
-; GFX8-NEXT: v_mov_b32_e32 v10, s1
+; GFX8-NEXT: v_mov_b32_e32 v10, s5
; GFX8-NEXT: s_waitcnt vmcnt(1)
; GFX8-NEXT: v_ffbh_u32_e32 v0, v4
; GFX8-NEXT: v_ffbh_u32_e32 v11, v2
@@ -763,7 +763,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v0
; GFX8-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v9
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v9
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v10, vcc
; GFX8-NEXT: v_or_b32_e32 v2, v4, v3
; GFX8-NEXT: v_or_b32_e32 v3, v6, v5
@@ -772,12 +772,12 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
;
; GFX11-LABEL: v_uint_to_fp_v4i64_to_v4f16:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 5, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b128 v[1:4], v5, s[2:3] offset:16
-; GFX11-NEXT: global_load_b128 v[5:8], v5, s[2:3]
+; GFX11-NEXT: global_load_b128 v[1:4], v5, s[6:7] offset:16
+; GFX11-NEXT: global_load_b128 v[5:8], v5, s[6:7]
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_clz_i32_u32_e32 v9, v4
; GFX11-NEXT: v_clz_i32_u32_e32 v10, v2
@@ -825,7 +825,7 @@ define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(ptr addrspace(1) %out, pt
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX11-NEXT: v_pack_b32_f16 v0, v4, v2
-; GFX11-NEXT: global_store_b64 v5, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v5, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
index 5f8d0f665a953..f4debc2706ce0 100644
--- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -25,38 +25,38 @@ define amdgpu_kernel void @uitofp_i16_to_f16(
;
; VI-LABEL: uitofp_i16_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_i16_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -91,41 +91,41 @@ define amdgpu_kernel void @uitofp_i32_to_f16(
;
; VI-LABEL: uitofp_i32_to_f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_i32_to_f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -168,44 +168,44 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
;
; VI-LABEL: uitofp_v2i16_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f16_u16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_v2i16_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -244,39 +244,39 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
;
; VI-LABEL: uitofp_v2i32_to_v2f16:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s10, s6
-; VI-NEXT: s_mov_b32 s11, s7
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s10, s2
+; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_mov_b32 s8, s2
-; VI-NEXT: s_mov_b32 s9, s3
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_u32_e32 v1, v1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: uitofp_v2i32_to_v2f16:
; GFX11: ; %bb.0: ; %entry
-; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s6, -1
-; GFX11-NEXT: s_mov_b32 s7, 0x31016000
-; GFX11-NEXT: s_mov_b32 s10, s6
-; GFX11-NEXT: s_mov_b32 s11, s7
+; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s8, s2
-; GFX11-NEXT: s_mov_b32 s9, s3
-; GFX11-NEXT: s_mov_b32 s4, s0
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0
-; GFX11-NEXT: s_mov_b32 s5, s1
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
@@ -285,7 +285,7 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0
+; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -357,20 +357,19 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
-; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0
; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -378,9 +377,10 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-NEXT: s_mov_b32 s0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
index f60a274f1e592..5fc395b95e1a5 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -244,12 +244,12 @@ define amdgpu_kernel void @uniform_if_move_valu(ptr addrspace(1) %out, float %a)
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
; VI-NEXT: s_cbranch_vccnz .LBB4_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB4_2: ; %endif
; VI-NEXT: s_endpgm
entry:
@@ -296,12 +296,12 @@ define amdgpu_kernel void @uniform_if_move_valu_commute(ptr addrspace(1) %out, f
; VI-NEXT: v_cmp_gt_u32_e32 vcc, 6, v0
; VI-NEXT: s_cbranch_vccnz .LBB5_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB5_2: ; %endif
; VI-NEXT: s_endpgm
entry:
@@ -342,20 +342,19 @@ define amdgpu_kernel void @uniform_if_else_ret(ptr addrspace(1) nocapture %out,
; VI-LABEL: uniform_if_else_ret:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB6_2
; VI-NEXT: ; %bb.1: ; %if.else
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 2
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB6_2: ; %if.then
-; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %a, 0
@@ -403,28 +402,29 @@ define amdgpu_kernel void @uniform_if_else(ptr addrspace(1) nocapture %out0, ptr
;
; VI-LABEL: uniform_if_else:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dword s4, s[0:1], 0x34
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x34
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc0 .LBB7_2
; VI-NEXT: ; %bb.1: ; %if.else
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v0, 2
; VI-NEXT: s_branch .LBB7_3
; VI-NEXT: .LBB7_2: ; %if.then
-; VI-NEXT: s_mov_b32 s4, s0
-; VI-NEXT: s_mov_b32 s5, s1
+; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s0, s4
+; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v0, 1
; VI-NEXT: .LBB7_3: ; %if.end
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_mov_b32 s0, s6
+; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: v_mov_b32_e32 v0, 3
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
entry:
%cmp = icmp eq i32 %a, 0
@@ -530,13 +530,13 @@ define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, p
; VI-NEXT: .LBB9_2: ; %bb9
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB9_3: ; %bb7
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v1
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -626,20 +626,20 @@ define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %
; VI-NEXT: s_and_saveexec_b64 s[2:3], vcc
; VI-NEXT: s_cbranch_execz .LBB11_2
; VI-NEXT: ; %bb.1: ; %if
-; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s4, 0
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_cbranch_scc0 .LBB11_3
; VI-NEXT: .LBB11_2: ; %endif
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB11_3: ; %if_uniform
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -692,18 +692,18 @@ define amdgpu_kernel void @divergent_inside_uniform(ptr addrspace(1) %out, i32 %
; VI-NEXT: .LBB12_1: ; %endif
; VI-NEXT: s_endpgm
; VI-NEXT: .LBB12_2: ; %if
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; VI-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
+; VI-NEXT: s_and_saveexec_b64 s[0:1], vcc
; VI-NEXT: s_cbranch_execz .LBB12_1
; VI-NEXT: ; %bb.3: ; %if_uniform
; VI-NEXT: v_mov_b32_e32 v0, 1
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%u_cmp = icmp eq i32 %cond, 0
@@ -832,16 +832,16 @@ define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, ptr
; VI-NEXT: s_cmp_lt_i32 s2, 1
; VI-NEXT: s_cbranch_scc1 .LBB14_2
; VI-NEXT: ; %bb.1: ; %bb2
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
-; VI-NEXT: s_mov_b32 s3, 0xf000
-; VI-NEXT: s_mov_b32 s2, -1
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c
+; VI-NEXT: s_mov_b32 s7, 0xf000
+; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: .LBB14_2: ; %bb9
; VI-NEXT: s_endpgm
bb:
@@ -886,20 +886,20 @@ define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, ptr addrspace(1) %ou
;
; VI-LABEL: uniform_if_scc_i64_eq:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_eq_u64 s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_eq_u64 s[4:5], 0
; VI-NEXT: s_cbranch_scc1 .LBB15_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB15_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp eq i64 %cond, 0
@@ -940,20 +940,20 @@ define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, ptr addrspace(1) %ou
;
; VI-LABEL: uniform_if_scc_i64_ne:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
; VI-NEXT: s_cbranch_scc1 .LBB16_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB16_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp ne i64 %cond, 0
@@ -994,21 +994,21 @@ define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, ptr addrspace(1) %o
;
; VI-LABEL: uniform_if_scc_i64_sgt:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_cmp_gt_i64_e64 s[4:5], s[0:1], 0
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s0, 0
-; VI-NEXT: s_and_b64 vcc, exec, s[4:5]
+; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_cmp_gt_i64_e64 s[2:3], s[4:5], 0
+; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccnz .LBB17_2
; VI-NEXT: ; %bb.1: ; %else
; VI-NEXT: s_mov_b32 s0, 1
; VI-NEXT: .LBB17_2: ; %done
-; VI-NEXT: s_mov_b32 s7, 0xf000
-; VI-NEXT: s_mov_b32 s6, -1
-; VI-NEXT: s_mov_b32 s4, s2
-; VI-NEXT: s_mov_b32 s5, s3
+; VI-NEXT: s_mov_b32 s11, 0xf000
+; VI-NEXT: s_mov_b32 s10, -1
+; VI-NEXT: s_mov_b32 s8, s6
+; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
entry:
%cmp0 = icmp sgt i64 %cond, 0
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index 666ae7c126ae3..092d74f4f40fd 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; VI-LABEL: s_usubo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: s_sub_u32 s0, s6, s0
+; VI-NEXT: s_sub_u32 s0, s6, s2
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: s_subb_u32 s1, s7, s1
+; VI-NEXT: s_subb_u32 s1, s7, s3
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
@@ -96,12 +96,12 @@ define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
; VI-LABEL: s_usubo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v4, s1
+; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
+; VI-NEXT: v_sub_u32_e32 v4, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -161,18 +161,18 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -182,16 +182,16 @@ define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -243,18 +243,18 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; VI-LABEL: v_usubo_i32_novcc:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_dword v4, v[0:1]
+; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_dword v4, v[0:1]
-; VI-NEXT: flat_load_dword v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
@@ -268,19 +268,19 @@ define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspac
;
; GFX9-LABEL: v_usubo_i32_novcc:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v1, v0, s[4:5]
-; GFX9-NEXT: global_load_dword v2, v0, s[6:7]
+; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
+; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
-; GFX9-NEXT: global_store_byte v0, v2, s[2:3]
+; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -325,19 +325,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: s_usubo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: s_sub_u32 s0, s4, s6
-; VI-NEXT: v_mov_b32_e32 v4, s4
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_subb_u32 s1, s5, s7
-; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: s_sub_u32 s0, s8, s10
+; VI-NEXT: v_mov_b32_e32 v4, s8
+; VI-NEXT: s_subb_u32 s1, s9, s11
+; VI-NEXT: v_mov_b32_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v6, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_mov_b32_e32 v2, s6
+; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
@@ -345,19 +345,19 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_sub_u32 s6, s4, s6
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: v_mov_b32_e32 v1, s5
-; GFX9-NEXT: s_subb_u32 s7, s5, s7
-; GFX9-NEXT: v_mov_b32_e32 v2, s6
-; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v3, s7
+; GFX9-NEXT: s_sub_u32 s0, s8, s10
+; GFX9-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-NEXT: s_subb_u32 s1, s9, s11
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
@@ -401,18 +401,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
@@ -424,18 +424,18 @@ define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i64:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX9-NEXT: global_store_byte v4, v0, s[2:3]
+; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -486,18 +486,18 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; VI-LABEL: v_usubo_i16:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
+; VI-NEXT: flat_load_ushort v4, v[0:1]
+; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
-; VI-NEXT: flat_load_ushort v4, v[0:1]
-; VI-NEXT: flat_load_ushort v5, v[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_gt_u16_e32 vcc, v5, v4
@@ -508,17 +508,17 @@ define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %
;
; GFX9-LABEL: v_usubo_i16:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_ushort v1, v0, s[4:5]
-; GFX9-NEXT: global_load_ushort v2, v0, s[6:7]
+; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
+; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX9-NEXT: global_store_short v0, v2, s[0:1]
-; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
+; GFX9-NEXT: global_store_short v0, v2, s[4:5]
+; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -568,18 +568,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; VI-LABEL: v_usubo_v2i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: v_mov_b32_e32 v1, s5
-; VI-NEXT: v_mov_b32_e32 v2, s6
-; VI-NEXT: v_mov_b32_e32 v3, s7
+; VI-NEXT: v_mov_b32_e32 v0, s8
+; VI-NEXT: v_mov_b32_e32 v1, s9
+; VI-NEXT: v_mov_b32_e32 v2, s10
+; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
-; VI-NEXT: v_mov_b32_e32 v6, s2
-; VI-NEXT: v_mov_b32_e32 v7, s3
+; VI-NEXT: v_mov_b32_e32 v4, s4
+; VI-NEXT: v_mov_b32_e32 v5, s5
+; VI-NEXT: v_mov_b32_e32 v6, s6
+; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
@@ -591,18 +591,18 @@ define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1)
;
; GFX9-LABEL: v_usubo_v2i32:
; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
-; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
-; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
+; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
+; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
index ca4d689156b49..f20a92df04622 100644
--- a/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_add_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
define amdgpu_kernel void @test_add_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
; GFX9-LABEL: test_add_co_sdwa:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT: global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
index e5de7d0a74642..27dcdf9f0bbda 100644
--- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -40,15 +40,15 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_eq_u32 s2, 0
+; VI-NEXT: s_cmp_eq_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -79,14 +79,14 @@ define amdgpu_kernel void @v_cnd_nan_nosgpr(ptr addrspace(1) %out, i32 %c, ptr a
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_eq_u32 s2, 0
+; GFX11-NEXT: s_cmp_eq_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -185,14 +185,14 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_sgprX_k0_select_k1_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -213,13 +213,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[4:5]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s3, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -279,13 +279,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[2:3]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, s4, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -317,14 +317,14 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_sgprX_k0_select_k0_sgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x4c
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x4c
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s1
-; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s0, 0
+; VI-NEXT: v_mov_b32_e32 v2, s5
+; VI-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; VI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -345,13 +345,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x4c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[4:5], s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[4:5]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s3, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -411,13 +411,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 s[2:3], s4, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 s[0:1], s4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[2:3]
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, s4, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -494,13 +494,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -579,13 +579,13 @@ define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s2, 0
+; GFX11-NEXT: v_cmp_nlg_f32_e64 vcc, s4, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
-; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: global_store_b32 v0, v1, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -703,14 +703,14 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -744,12 +744,12 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
@@ -795,14 +795,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -836,12 +836,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 2, v2, vcc
@@ -888,14 +888,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -931,12 +931,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(ptr addrspace(1) %o
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[2:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[0:1]
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
@@ -988,15 +988,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_select_vgprZ_k1_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1037,13 +1037,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1097,15 +1097,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1146,13 +1146,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_ge_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1208,15 +1208,15 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 4, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1257,13 +1257,13 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v5, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b128 v[0:3], v4, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 4.0, v5
; GFX11-NEXT: v_cndmask_b32_e32 v3, 4.0, v3, vcc
@@ -1316,14 +1316,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v4, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc
; VI-NEXT: flat_load_dword v2, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1363,12 +1363,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_u8 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
@@ -1424,15 +1424,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_selectf64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1470,13 +1470,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_le_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v1, vcc
@@ -1527,15 +1527,15 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
; VI-LABEL: fcmp_vgprX_k0_selecti64_k1_vgprZ_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s7
; VI-NEXT: v_add_u32_e32 v1, vcc, s6, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_add_u32_e32 v3, vcc, s0, v5
+; VI-NEXT: v_mov_b32_e32 v0, s3
+; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v5
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v0, vcc
; VI-NEXT: flat_load_dword v6, v[1:2] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1572,13 +1572,13 @@ define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v1, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b64 v[0:1], v2, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b64 v[0:1], v2, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v3
; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -1626,14 +1626,14 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
; VI-LABEL: icmp_vgprX_k0_selectf32_k1_vgprZ_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1667,12 +1667,12 @@ define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1
; GFX11-NEXT: v_cndmask_b32_e32 v1, 4.0, v2, vcc
@@ -1723,14 +1723,14 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
; VI-LABEL: fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
+; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v5, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1772,12 +1772,12 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(ptr add
; GFX11: ; %bb.0:
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x34
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] glc dlc
+; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_cmp_nle_f32_e32 vcc, 4.0, v1
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, -1.0, vcc
@@ -1839,17 +1839,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1882,17 +1882,17 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f16(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX11-NEXT: global_store_b16 v2, v0, s[0:1]
+; GFX11-NEXT: global_store_b16 v2, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -1937,15 +1937,15 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
-; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
+; VI-NEXT: s_cselect_b64 s[0:1], -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_cndmask_b32_e64 v2, -v0, |v0|, s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
@@ -1976,14 +1976,14 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f32(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
-; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
+; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[2:3]
-; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: v_cndmask_b32_e64 v0, -v0, |v0|, s[0:1]
+; GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -2032,18 +2032,18 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u32 s2, 0
+; VI-NEXT: s_cmp_lg_u32 s4, 0
; VI-NEXT: s_cselect_b64 vcc, -1, 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
;
@@ -2077,10 +2077,10 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[2:3]
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_cmp_lg_u32 s2, 0
+; GFX11-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-NEXT: s_cselect_b64 vcc, -1, 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
@@ -2088,7 +2088,7 @@ define amdgpu_kernel void @v_cndmask_abs_neg_f64(ptr addrspace(1) %out, i32 %c,
; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v0, vcc
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1]
+; GFX11-NEXT: global_store_b64 v3, v[0:1], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
index f7933d719f989..4b76d5caabe66 100644
--- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -56,25 +56,25 @@ define amdgpu_kernel void @madak_f16(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
-; GFX11-NEXT: s_mov_b32 s10, -1
-; GFX11-NEXT: s_mov_b32 s11, 0x31016000
-; GFX11-NEXT: s_mov_b32 s14, s10
-; GFX11-NEXT: s_mov_b32 s15, s11
-; GFX11-NEXT: s_mov_b32 s2, s10
-; GFX11-NEXT: s_mov_b32 s3, s11
+; GFX11-NEXT: s_load_b64 s[8:9], s[0:1], 0x34
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s12, s6
; GFX11-NEXT: s_mov_b32 s13, s7
; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0
-; GFX11-NEXT: buffer_load_u16 v1, off, s[0:3], 0
-; GFX11-NEXT: s_mov_b32 s8, s4
-; GFX11-NEXT: s_mov_b32 s9, s5
+; GFX11-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -170,15 +170,15 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b256 s[4:11], s[0:1], 0x24
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x44
-; GFX11-NEXT: s_mov_b32 s14, -1
-; GFX11-NEXT: s_mov_b32 s15, 0x31016000
-; GFX11-NEXT: s_mov_b32 s18, s14
-; GFX11-NEXT: s_mov_b32 s19, s15
-; GFX11-NEXT: s_mov_b32 s22, s14
-; GFX11-NEXT: s_mov_b32 s23, s15
-; GFX11-NEXT: s_mov_b32 s2, s14
-; GFX11-NEXT: s_mov_b32 s3, s15
+; GFX11-NEXT: s_load_b64 s[12:13], s[0:1], 0x44
+; GFX11-NEXT: s_mov_b32 s2, -1
+; GFX11-NEXT: s_mov_b32 s3, 0x31016000
+; GFX11-NEXT: s_mov_b32 s18, s2
+; GFX11-NEXT: s_mov_b32 s19, s3
+; GFX11-NEXT: s_mov_b32 s22, s2
+; GFX11-NEXT: s_mov_b32 s23, s3
+; GFX11-NEXT: s_mov_b32 s14, s2
+; GFX11-NEXT: s_mov_b32 s15, s3
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s16, s8
; GFX11-NEXT: s_mov_b32 s17, s9
@@ -188,19 +188,21 @@ define amdgpu_kernel void @madak_f16_use_2(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc
+; GFX11-NEXT: buffer_load_u16 v2, off, s[12:15], 0 glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_mov_b32 s12, s4
-; GFX11-NEXT: s_mov_b32 s13, s5
-; GFX11-NEXT: s_mov_b32 s0, s6
-; GFX11-NEXT: s_mov_b32 s1, s7
+; GFX11-NEXT: s_mov_b32 s0, s4
+; GFX11-NEXT: s_mov_b32 s1, s5
+; GFX11-NEXT: s_mov_b32 s10, s2
+; GFX11-NEXT: s_mov_b32 s11, s3
+; GFX11-NEXT: s_mov_b32 s8, s6
+; GFX11-NEXT: s_mov_b32 s9, s7
; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1
; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0
-; GFX11-NEXT: buffer_store_b16 v1, off, s[12:15], 0
-; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_pack.ll b/llvm/test/CodeGen/AMDGPU/v_pack.ll
index 8bc8fbd0e0e84..c2abd4f8a7080 100644
--- a/llvm/test/CodeGen/AMDGPU/v_pack.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_pack.ll
@@ -7,12 +7,12 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -24,12 +24,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
;
; GISEL-LABEL: v_pack_b32_v2f16:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -56,12 +56,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16(ptr addrspace(1) %in0, ptr addrspace
define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32_v2f16_sub:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_subrev_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -73,12 +73,12 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
;
; GISEL-LABEL: v_pack_b32_v2f16_sub:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_subrev_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -105,36 +105,36 @@ define amdgpu_kernel void @v_pack_b32_v2f16_sub(ptr addrspace(1) %in0, ptr addrs
define amdgpu_kernel void @fptrunc(
; GCN-LABEL: fptrunc:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: s_mov_b32 s7, 0x31016000
-; GCN-NEXT: s_mov_b32 s10, s6
-; GCN-NEXT: s_mov_b32 s11, s7
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0x31016000
+; GCN-NEXT: s_mov_b32 s10, s2
+; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s8, s2
-; GCN-NEXT: s_mov_b32 s9, s3
-; GCN-NEXT: s_mov_b32 s4, s0
+; GCN-NEXT: s_mov_b32 s8, s6
+; GCN-NEXT: s_mov_b32 s9, s7
+; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
-; GCN-NEXT: s_mov_b32 s5, s1
+; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_pack_b32_f16 v0, v0, v1
-; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; GISEL-LABEL: fptrunc:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GISEL-NEXT: s_mov_b32 s6, -1
+; GISEL-NEXT: s_mov_b32 s7, 0x31016000
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s2
-; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s3
-; GISEL-NEXT: s_mov_b32 s2, -1
-; GISEL-NEXT: s_mov_b32 s3, 0x31016000
+; GISEL-NEXT: v_cvt_f16_f32_e32 v0, s0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v1, s1
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
-; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GISEL-NEXT: s_endpgm
ptr addrspace(1) %r,
ptr addrspace(1) %a) {
@@ -147,12 +147,12 @@ define amdgpu_kernel void @fptrunc(
define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32.fabs:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -164,12 +164,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
;
; GISEL-LABEL: v_pack_b32.fabs:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -198,12 +198,12 @@ define amdgpu_kernel void @v_pack_b32.fabs(ptr addrspace(1) %in0, ptr addrspace(
define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(1) %in1) #0 {
; GCN-LABEL: v_pack_b32.fneg:
; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GCN-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GCN-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f16_e32 v0, 2.0, v1
; GCN-NEXT: v_add_f16_e32 v1, 2.0, v2
@@ -215,12 +215,12 @@ define amdgpu_kernel void @v_pack_b32.fneg(ptr addrspace(1) %in0, ptr addrspace(
;
; GISEL-LABEL: v_pack_b32.fneg:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-NEXT: global_load_ushort v1, v0, s[0:1] glc dlc
+; GISEL-NEXT: global_load_ushort v1, v0, s[4:5] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
-; GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc
+; GISEL-NEXT: global_load_ushort v2, v0, s[6:7] glc dlc
; GISEL-NEXT: s_waitcnt vmcnt(0)
; GISEL-NEXT: v_add_f16_e32 v0, 2.0, v1
; GISEL-NEXT: v_add_f16_e32 v1, 2.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index 7f69c4733b38a..6c8f28881f175 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -413,18 +413,18 @@ define <2 x i16> @vec_smax_smin(<2 x i16> %src) {
define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> inreg %src) {
; SDAG-VI-LABEL: vec_smax_smin_sgpr:
; SDAG-VI: ; %bb.0:
-; SDAG-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; SDAG-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; SDAG-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; SDAG-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; SDAG-VI-NEXT: v_mov_b32_e32 v0, 0xff
; SDAG-VI-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-VI-NEXT: s_lshr_b32 s3, s2, 16
-; SDAG-VI-NEXT: v_max_i16_e64 v1, s2, 0
-; SDAG-VI-NEXT: v_max_i16_e64 v2, s3, 0
+; SDAG-VI-NEXT: s_lshr_b32 s0, s4, 16
+; SDAG-VI-NEXT: v_max_i16_e64 v1, s4, 0
+; SDAG-VI-NEXT: v_max_i16_e64 v2, s0, 0
; SDAG-VI-NEXT: v_min_i16_e32 v1, 0xff, v1
; SDAG-VI-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; SDAG-VI-NEXT: v_or_b32_e32 v2, v1, v0
-; SDAG-VI-NEXT: v_mov_b32_e32 v0, s0
-; SDAG-VI-NEXT: v_mov_b32_e32 v1, s1
+; SDAG-VI-NEXT: v_mov_b32_e32 v0, s2
+; SDAG-VI-NEXT: v_mov_b32_e32 v1, s3
; SDAG-VI-NEXT: flat_store_dword v[0:1], v2
; SDAG-VI-NEXT: s_endpgm
;
@@ -443,41 +443,41 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; SDAG-GFX11-LABEL: vec_smax_smin_sgpr:
; SDAG-GFX11: ; %bb.0:
; SDAG-GFX11-NEXT: s_clause 0x1
-; SDAG-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; SDAG-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; SDAG-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; SDAG-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s2, 0
+; SDAG-GFX11-NEXT: v_pk_max_i16 v0, s4, 0
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1]
-; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; SDAG-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; SDAG-GFX11-NEXT: s_nop 0
; SDAG-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; SDAG-GFX11-NEXT: s_endpgm
;
; GISEL-VI-LABEL: vec_smax_smin_sgpr:
; GISEL-VI: ; %bb.0:
-; GISEL-VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; GISEL-VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, 0
+; GISEL-VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; GISEL-VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GISEL-VI-NEXT: s_sext_i32_i16 s0, 0
; GISEL-VI-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-VI-NEXT: s_lshr_b32 s4, s2, 16
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
+; GISEL-VI-NEXT: s_lshr_b32 s1, s4, 16
; GISEL-VI-NEXT: s_sext_i32_i16 s4, s4
-; GISEL-VI-NEXT: s_max_i32 s2, s2, s3
-; GISEL-VI-NEXT: s_max_i32 s3, s4, s3
+; GISEL-VI-NEXT: s_sext_i32_i16 s1, s1
+; GISEL-VI-NEXT: s_max_i32 s4, s4, s0
+; GISEL-VI-NEXT: s_max_i32 s0, s1, s0
+; GISEL-VI-NEXT: s_sext_i32_i16 s1, s4
; GISEL-VI-NEXT: s_sext_i32_i16 s4, 0xff
-; GISEL-VI-NEXT: s_sext_i32_i16 s3, s3
-; GISEL-VI-NEXT: s_sext_i32_i16 s2, s2
-; GISEL-VI-NEXT: s_min_i32 s3, s3, s4
-; GISEL-VI-NEXT: s_min_i32 s2, s2, s4
-; GISEL-VI-NEXT: s_and_b32 s3, 0xffff, s3
-; GISEL-VI-NEXT: s_and_b32 s2, 0xffff, s2
-; GISEL-VI-NEXT: s_lshl_b32 s3, s3, 16
-; GISEL-VI-NEXT: s_or_b32 s2, s2, s3
-; GISEL-VI-NEXT: v_mov_b32_e32 v0, s0
-; GISEL-VI-NEXT: v_mov_b32_e32 v2, s2
-; GISEL-VI-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-VI-NEXT: s_sext_i32_i16 s0, s0
+; GISEL-VI-NEXT: s_min_i32 s0, s0, s4
+; GISEL-VI-NEXT: s_min_i32 s1, s1, s4
+; GISEL-VI-NEXT: s_and_b32 s0, 0xffff, s0
+; GISEL-VI-NEXT: s_and_b32 s1, 0xffff, s1
+; GISEL-VI-NEXT: s_lshl_b32 s0, s0, 16
+; GISEL-VI-NEXT: s_or_b32 s0, s1, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-VI-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-VI-NEXT: v_mov_b32_e32 v1, s3
; GISEL-VI-NEXT: flat_store_dword v[0:1], v2
; GISEL-VI-NEXT: s_endpgm
;
@@ -506,26 +506,26 @@ define amdgpu_kernel void @vec_smax_smin_sgpr(ptr addrspace(1) %out, <2 x i16> i
; GISEL-GFX11-LABEL: vec_smax_smin_sgpr:
; GISEL-GFX11: ; %bb.0:
; GISEL-GFX11-NEXT: s_clause 0x1
-; GISEL-GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
-; GISEL-GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0
+; GISEL-GFX11-NEXT: s_load_b32 s4, s[0:1], 0x2c
+; GISEL-GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s0, 0
; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2
-; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16
-; GISEL-GFX11-NEXT: s_max_i32 s3, s4, s3
-; GISEL-GFX11-NEXT: s_max_i32 s2, s2, 0
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s1, s4
+; GISEL-GFX11-NEXT: s_ashr_i32 s4, s4, 16
+; GISEL-GFX11-NEXT: s_max_i32 s0, s1, s0
+; GISEL-GFX11-NEXT: s_max_i32 s1, s4, 0
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s3, 0xff00ff
-; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s2
-; GISEL-GFX11-NEXT: s_ashr_i32 s2, s2, 16
-; GISEL-GFX11-NEXT: s_min_i32 s3, s4, s3
-; GISEL-GFX11-NEXT: s_min_i32 s2, s2, 0xff
+; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s1
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s1, 0xff00ff
+; GISEL-GFX11-NEXT: s_sext_i32_i16 s4, s0
+; GISEL-GFX11-NEXT: s_ashr_i32 s0, s0, 16
+; GISEL-GFX11-NEXT: s_min_i32 s1, s4, s1
+; GISEL-GFX11-NEXT: s_min_i32 s0, s0, 0xff
; GISEL-GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2
-; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s2
-; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s0
+; GISEL-GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-GFX11-NEXT: global_store_b32 v1, v0, s[2:3]
; GISEL-GFX11-NEXT: s_nop 0
; GISEL-GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GISEL-GFX11-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
index d5347f829002d..b60ae193c2458 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sub_u64_pseudo_sdwa.ll
@@ -25,16 +25,16 @@ bb:
define amdgpu_kernel void @test_sub_co_sdwa(ptr addrspace(1) %arg, ptr addrspace(1) %arg1) #0 {
; GFX9-LABEL: test_sub_co_sdwa:
; GFX9: ; %bb.0: ; %bb
-; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v4, v2, s[2:3]
-; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[0:1]
+; GFX9-NEXT: global_load_dword v4, v2, s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[0:1], v3, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_sdwa v0, vcc, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX9-NEXT: global_store_dwordx2 v3, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 340f0cdd5d5d0..836b1d40571a9 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -473,9 +473,9 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
; SI-NEXT: bb.1.if.then:
; SI-NEXT: successors: %bb.7(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
- ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %48, 0, implicit $exec
- ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
+ ; SI-NEXT: early-clobber %27:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset, align 4, addrspace 4)
+ ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 %27.sub0, killed %48, 0, implicit $exec
+ ; SI-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed %27.sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec
; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed [[V_ADDC_U32_e64_]], %subreg.sub1
; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1)
; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
@@ -570,9 +570,9 @@ define protected amdgpu_kernel void @nested_waterfalls(ptr addrspace(1) %tex.coe
; SI-NEXT: bb.1.if.then:
; SI-NEXT: successors: %bb.2(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
+ ; SI-NEXT: early-clobber %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM_ec killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.tex.coerce.kernarg.offset, align 4, addrspace 4)
; SI-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = nuw nsw V_LSHLREV_B32_e64 3, killed [[COPY1]](s32), implicit $exec
- ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed [[S_LOAD_DWORDX2_IMM]], killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
+ ; SI-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR killed %4, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $exec :: (load (s64) from %ir.idx, addrspace 1)
; SI-NEXT: [[GLOBAL_LOAD_DWORDX4_:%[0-9]+]]:vreg_128 = GLOBAL_LOAD_DWORDX4 [[GLOBAL_LOAD_DWORDX2_SADDR]], 16, 0, implicit $exec :: (invariant load (s128) from %ir.3 + 16, addrspace 4)
; SI-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub3
; SI-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_LOAD_DWORDX4_]].sub2
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index f78b408d78255..ea48047b8d01d 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -286,8 +286,8 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-LABEL: v32i8_liveout:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX906-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v31, 5, v0
-; GFX906-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX906-NEXT: v_mov_b32_e32 v9, 0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
@@ -319,7 +319,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_lshrrev_b32_e32 v30, 24, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
-; GFX906-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB5_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dwordx4 v[1:4], v31, s[6:7] offset:16
@@ -351,7 +351,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GFX906-NEXT: v_lshrrev_b32_e32 v33, 8, v5
; GFX906-NEXT: .LBB5_2: ; %bb.2
-; GFX906-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX906-NEXT: v_lshlrev_b16_e32 v30, 8, v30
; GFX906-NEXT: v_lshlrev_b16_e32 v31, 8, v33
; GFX906-NEXT: v_lshlrev_b16_e32 v29, 8, v29
@@ -372,7 +372,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_or_b32_sdwa v6, v6, v27 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v7, v7, v24 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v8, v8, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[0:1]
+; GFX906-NEXT: global_store_dwordx4 v9, v[5:8], s[2:3]
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; GFX906-NEXT: v_lshlrev_b16_e32 v5, 8, v20
; GFX906-NEXT: v_or_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -393,7 +393,7 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
; GFX906-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v0, v10, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX906-NEXT: v_or_b32_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[0:1] offset:16
+; GFX906-NEXT: global_store_dwordx4 v9, v[1:4], s[2:3] offset:16
; GFX906-NEXT: s_endpgm
entry:
%idx = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index e12a4beb5dbe5..a033d5d5df434 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -8,26 +8,26 @@
define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_i32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_i32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid
@@ -41,26 +41,26 @@ define amdgpu_kernel void @test_vopc_i32(ptr addrspace(1) %arg) {
define amdgpu_kernel void @test_vopc_f32(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_f32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_f32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid
@@ -101,28 +101,28 @@ define amdgpu_ps void @test_vopc_vcmp(float %x) {
define amdgpu_kernel void @test_vopc_2xf16(ptr addrspace(1) %arg) {
; GFX1032-LABEL: test_vopc_2xf16:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1032-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_le_f16_sdwa vcc_lo, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1032-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc_lo
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vopc_2xf16:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[0:1]
+; GFX1064-NEXT: global_load_dword v1, v0, s[2:3]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
; GFX1064-NEXT: v_cndmask_b32_e32 v1, 0x3c003c00, v1, vcc
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v1, s[2:3]
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x half>, ptr addrspace(1) %arg, i32 %lid
@@ -321,10 +321,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB9_2
; GFX1032-NEXT: ; %bb.1: ; %if
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX1032-NEXT: global_store_dword v0, v0, s[2:3]
; GFX1032-NEXT: .LBB9_2: ; %endif
; GFX1032-NEXT: s_endpgm
;
@@ -334,10 +334,10 @@ define amdgpu_kernel void @test_mask_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB9_2
; GFX1064-NEXT: ; %bb.1: ; %if
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_store_dword v0, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v0, v0, s[2:3]
; GFX1064-NEXT: .LBB9_2: ; %endif
; GFX1064-NEXT: s_endpgm
%lid = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -355,9 +355,9 @@ endif:
define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-LABEL: test_loop_with_if:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
-; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: s_mov_b32 s0, 0
; GFX1032-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1032-NEXT: s_branch .LBB10_2
; GFX1032-NEXT: .LBB10_1: ; %bb13
@@ -366,25 +366,25 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfe, v4
; GFX1032-NEXT: v_add_nc_u32_e32 v1, 1, v4
-; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT: s_cbranch_execz .LBB10_8
; GFX1032-NEXT: .LBB10_2: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: v_cmp_ge_i32_e64 s4, v1, v0
; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v1, v0
-; GFX1032-NEXT: s_mov_b32 s3, 0
+; GFX1032-NEXT: s_mov_b32 s1, 0
; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB10_4
; GFX1032-NEXT: ; %bb.3: ; %bb5
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
-; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_mov_b32 s1, exec_lo
; GFX1032-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_add_co_u32 v2, vcc_lo, s0, v2
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
+; GFX1032-NEXT: v_add_co_u32 v2, vcc_lo, s2, v2
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
; GFX1032-NEXT: global_load_dword v4, v[2:3], off
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v4
@@ -399,13 +399,13 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1032-NEXT: ; %bb.5: ; %bb11
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: v_lshrrev_b32_e32 v4, 31, v1
-; GFX1032-NEXT: s_andn2_b32 s3, s3, exec_lo
+; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX1032-NEXT: v_add_nc_u32_e32 v4, v1, v4
; GFX1032-NEXT: v_ashrrev_i32_e32 v4, 1, v4
; GFX1032-NEXT: ; %bb.6: ; %Flow1
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX1032-NEXT: s_and_saveexec_b32 s4, s3
+; GFX1032-NEXT: s_and_saveexec_b32 s4, s1
; GFX1032-NEXT: s_cbranch_execz .LBB10_1
; GFX1032-NEXT: ; %bb.7: ; %bb10
; GFX1032-NEXT: ; in Loop: Header=BB10_2 Depth=1
@@ -417,9 +417,9 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
;
; GFX1064-LABEL: test_loop_with_if:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX1064-NEXT: s_branch .LBB10_2
; GFX1064-NEXT: .LBB10_1: ; %bb13
@@ -428,8 +428,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0xfe, v4
; GFX1064-NEXT: v_add_nc_u32_e32 v1, 1, v4
-; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB10_8
; GFX1064-NEXT: .LBB10_2: ; %bb2
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -445,8 +445,8 @@ define amdgpu_kernel void @test_loop_with_if(ptr addrspace(1) %arg) #0 {
; GFX1064-NEXT: s_mov_b64 s[4:5], exec
; GFX1064-NEXT: v_lshlrev_b64 v[2:3], 2, v[1:2]
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_add_co_u32 v2, vcc, s0, v2
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v3, vcc, s1, v3, vcc
+; GFX1064-NEXT: v_add_co_u32 v2, vcc, s2, v2
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v3, vcc, s3, v3, vcc
; GFX1064-NEXT: global_load_dword v4, v[2:3], off
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v4
@@ -516,43 +516,43 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #
; GFX1032-LABEL: test_loop_with_if_else_break:
; GFX1032: ; %bb.0: ; %bb
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_mov_b32 s2, 0
-; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_mov_b32 s4, 0
+; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: ; %bb.1: ; %.preheader
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_min_u32_e32 v1, 0x100, v0
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: s_mov_b32 s3, 0
-; GFX1032-NEXT: ; implicit-def: $sgpr4
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: ; implicit-def: $sgpr1
; GFX1032-NEXT: s_branch .LBB11_4
; GFX1032-NEXT: .LBB11_2: ; %bb8
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: s_add_i32 s3, s3, 1
-; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
-; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s3, v1
-; GFX1032-NEXT: s_add_u32 s0, s0, 4
-; GFX1032-NEXT: s_addc_u32 s1, s1, 0
-; GFX1032-NEXT: s_andn2_b32 s4, s4, exec_lo
+; GFX1032-NEXT: s_add_i32 s0, s0, 1
+; GFX1032-NEXT: global_store_dword v2, v0, s[2:3]
+; GFX1032-NEXT: v_cmp_ge_u32_e32 vcc_lo, s0, v1
+; GFX1032-NEXT: s_add_u32 s2, s2, 4
+; GFX1032-NEXT: s_addc_u32 s3, s3, 0
+; GFX1032-NEXT: s_andn2_b32 s1, s1, exec_lo
; GFX1032-NEXT: s_and_b32 s5, vcc_lo, exec_lo
-; GFX1032-NEXT: s_or_b32 s4, s4, s5
+; GFX1032-NEXT: s_or_b32 s1, s1, s5
; GFX1032-NEXT: .LBB11_3: ; %Flow
; GFX1032-NEXT: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: s_and_b32 s5, exec_lo, s4
-; GFX1032-NEXT: s_or_b32 s2, s5, s2
-; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_and_b32 s5, exec_lo, s1
+; GFX1032-NEXT: s_or_b32 s4, s5, s4
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT: s_cbranch_execz .LBB11_6
; GFX1032-NEXT: .LBB11_4: ; %bb2
; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v3, v2, s[0:1]
-; GFX1032-NEXT: s_or_b32 s4, s4, exec_lo
+; GFX1032-NEXT: global_load_dword v3, v2, s[2:3]
+; GFX1032-NEXT: s_or_b32 s1, s1, exec_lo
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 11, v3
; GFX1032-NEXT: s_cbranch_vccz .LBB11_2
; GFX1032-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
-; GFX1032-NEXT: ; implicit-def: $sgpr3
-; GFX1032-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX1032-NEXT: ; implicit-def: $sgpr0
+; GFX1032-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX1032-NEXT: s_branch .LBB11_3
; GFX1032-NEXT: .LBB11_6: ; %.loopexit
; GFX1032-NEXT: s_endpgm
@@ -564,39 +564,39 @@ define amdgpu_kernel void @test_loop_with_if_else_break(ptr addrspace(1) %arg) #
; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
; GFX1064-NEXT: ; %bb.1: ; %.preheader
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_min_u32_e32 v1, 0x100, v0
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
; GFX1064-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX1064-NEXT: s_branch .LBB11_4
; GFX1064-NEXT: .LBB11_2: ; %bb8
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: s_add_i32 s6, s6, 1
-; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1064-NEXT: v_cmp_ge_u32_e32 vcc, s6, v1
-; GFX1064-NEXT: s_add_u32 s0, s0, 4
-; GFX1064-NEXT: s_addc_u32 s1, s1, 0
+; GFX1064-NEXT: s_add_u32 s2, s2, 4
+; GFX1064-NEXT: s_addc_u32 s3, s3, 0
; GFX1064-NEXT: s_andn2_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT: s_and_b64 s[8:9], vcc, exec
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9]
; GFX1064-NEXT: .LBB11_3: ; %Flow
; GFX1064-NEXT: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: s_and_b64 s[8:9], exec, s[4:5]
-; GFX1064-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3]
-; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX1064-NEXT: s_cbranch_execz .LBB11_6
; GFX1064-NEXT: .LBB11_4: ; %bb2
; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX1064-NEXT: global_load_dword v3, v2, s[2:3]
; GFX1064-NEXT: s_or_b64 s[4:5], s[4:5], exec
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 11, v3
; GFX1064-NEXT: s_cbranch_vccz .LBB11_2
; GFX1064-NEXT: ; %bb.5: ; in Loop: Header=BB11_4 Depth=1
; GFX1064-NEXT: ; implicit-def: $sgpr6
-; GFX1064-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX1064-NEXT: ; implicit-def: $sgpr2_sgpr3
; GFX1064-NEXT: s_branch .LBB11_3
; GFX1064-NEXT: .LBB11_6: ; %.loopexit
; GFX1064-NEXT: s_endpgm
@@ -631,26 +631,26 @@ bb8:
define amdgpu_kernel void @test_addc_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_addc_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_addc_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_add_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -664,26 +664,26 @@ bb:
define amdgpu_kernel void @test_subbrev_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subbrev_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s2
-; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s6
+; GFX1032-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_subbrev_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s2
-; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, v0, s6
+; GFX1064-NEXT: v_subrev_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -697,26 +697,26 @@ bb:
define amdgpu_kernel void @test_subb_vop2b(ptr addrspace(1) %arg, i64 %arg1) #0 {
; GFX1032-LABEL: test_subb_vop2b:
; GFX1032: ; %bb.0: ; %bb
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1032-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v0
-; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s6, v0
+; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s7, v1, vcc_lo
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_subb_vop2b:
; GFX1064: ; %bb.0: ; %bb
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX1064-NEXT: global_load_dwordx2 v[0:1], v2, s[4:5]
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v0
-; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s6, v0
+; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s7, v1, vcc
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1063,30 +1063,30 @@ bb:
define amdgpu_kernel void @test_div_scale_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX1032-LABEL: test_div_scale_f32:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1032-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
-; GFX1032-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1032-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
-; GFX1032-NEXT: v_div_scale_f32 v1, s2, v2, v2, v1
-; GFX1032-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1032-NEXT: v_div_scale_f32 v1, s0, v2, v2, v1
+; GFX1032-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_div_scale_f32:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] glc dlc
+; GFX1064-NEXT: global_load_dword v1, v0, s[6:7] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
-; GFX1064-NEXT: global_load_dword v2, v0, s[2:3] offset:4 glc dlc
+; GFX1064-NEXT: global_load_dword v2, v0, s[6:7] offset:4 glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
-; GFX1064-NEXT: v_div_scale_f32 v1, s[2:3], v2, v2, v1
-; GFX1064-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], v2, v2, v1
+; GFX1064-NEXT: global_store_dword v0, v1, s[4:5]
; GFX1064-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -1106,30 +1106,32 @@ define amdgpu_kernel void @test_div_scale_f64(ptr addrspace(1) %out, ptr addrspa
; GFX1032: ; %bb.0:
; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX1032-NEXT: s_waitcnt vmcnt(0)
; GFX1032-NEXT: v_div_scale_f64 v[0:1], s2, v[0:1], v[2:3], v[0:1]
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_div_scale_f64:
; GFX1064: ; %bb.0:
; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 3, v0
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 glc dlc
; GFX1064-NEXT: s_waitcnt vmcnt(0)
; GFX1064-NEXT: v_div_scale_f64 v[0:1], s[2:3], v[0:1], v[2:3], v[0:1]
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
; GFX1064-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
@@ -1451,11 +1453,11 @@ define amdgpu_kernel void @test_brcc_i1(ptr addrspace(1) noalias %out, ptr addrs
; GCN-NEXT: s_bitcmp0_b32 s2, 0
; GCN-NEXT: s_cbranch_scc1 .LBB25_2
; GCN-NEXT: ; %bb.1: ; %store
-; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0xde
; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dword v0, v1, s[0:1]
+; GCN-NEXT: global_store_dword v0, v1, s[2:3]
; GCN-NEXT: .LBB25_2: ; %end
; GCN-NEXT: s_endpgm
%cmp0 = icmp ne i1 %val, 0
@@ -1634,7 +1636,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1032-LABEL: test_movrels_extract_neg_offset_vgpr:
; GFX1032: ; %bb.0: ; %entry
; GFX1032-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
-; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
@@ -1643,13 +1645,13 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1032-NEXT: v_cmp_ne_u32_e32 vcc_lo, 3, v0
; GFX1032-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc_lo
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1032-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_movrels_extract_neg_offset_vgpr:
; GFX1064: ; %bb.0: ; %entry
; GFX1064-NEXT: v_add_nc_u32_e32 v0, 0xfffffe00, v0
-; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1658,7 +1660,7 @@ define amdgpu_kernel void @test_movrels_extract_neg_offset_vgpr(ptr addrspace(1)
; GFX1064-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
; GFX1064-NEXT: v_cndmask_b32_e32 v0, 3, v1, vcc
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: global_store_dword v2, v0, s[0:1]
+; GFX1064-NEXT: global_store_dword v2, v0, s[2:3]
; GFX1064-NEXT: s_endpgm
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -1704,30 +1706,30 @@ define amdgpu_kernel void @test_set_inactive(ptr addrspace(1) %out, i32 %in) #0
define amdgpu_kernel void @test_set_inactive_64(ptr addrspace(1) %out, i64 %in) #0 {
; GFX1032-LABEL: test_set_inactive_64:
; GFX1032: ; %bb.0:
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, s6
+; GFX1032-NEXT: v_mov_b32_e32 v1, s7
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
; GFX1032-NEXT: v_mov_b32_e32 v0, 0
; GFX1032-NEXT: v_mov_b32_e32 v1, 0
; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_set_inactive_64:
; GFX1064: ; %bb.0:
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s6
+; GFX1064-NEXT: v_mov_b32_e32 v1, s7
; GFX1064-NEXT: s_not_b64 exec, exec
; GFX1064-NEXT: v_mov_b32_e32 v0, 0
; GFX1064-NEXT: v_mov_b32_e32 v1, 0
; GFX1064-NEXT: s_not_b64 exec, exec
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
%tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0)
store i64 %tmp, ptr addrspace(1) %out
@@ -2354,42 +2356,42 @@ define amdgpu_ps float @test_ps_live() #0 {
define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX1032-LABEL: test_vccnz_ifcvt_triangle64:
; GFX1032: ; %bb.0: ; %entry
-; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1032-NEXT: v_cmp_neq_f64_e64 s4, s[2:3], 1.0
-; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s4
+; GFX1032-NEXT: v_cmp_neq_f64_e64 s2, s[0:1], 1.0
+; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1032-NEXT: s_cbranch_vccnz .LBB47_2
; GFX1032-NEXT: ; %bb.1: ; %if
-; GFX1032-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1032-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1]
; GFX1032-NEXT: s_branch .LBB47_3
; GFX1032-NEXT: .LBB47_2:
-; GFX1032-NEXT: v_mov_b32_e32 v0, s2
-; GFX1032-NEXT: v_mov_b32_e32 v1, s3
+; GFX1032-NEXT: v_mov_b32_e32 v0, s0
+; GFX1032-NEXT: v_mov_b32_e32 v1, s1
; GFX1032-NEXT: .LBB47_3: ; %endif
; GFX1032-NEXT: v_mov_b32_e32 v2, 0
-; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1032-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1032-NEXT: s_endpgm
;
; GFX1064-LABEL: test_vccnz_ifcvt_triangle64:
; GFX1064: ; %bb.0: ; %entry
-; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
-; GFX1064-NEXT: v_cmp_neq_f64_e64 s[4:5], s[2:3], 1.0
-; GFX1064-NEXT: s_and_b64 vcc, exec, s[4:5]
+; GFX1064-NEXT: v_cmp_neq_f64_e64 s[2:3], s[0:1], 1.0
+; GFX1064-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX1064-NEXT: s_cbranch_vccnz .LBB47_2
; GFX1064-NEXT: ; %bb.1: ; %if
-; GFX1064-NEXT: v_add_f64 v[0:1], s[2:3], s[2:3]
+; GFX1064-NEXT: v_add_f64 v[0:1], s[0:1], s[0:1]
; GFX1064-NEXT: s_branch .LBB47_3
; GFX1064-NEXT: .LBB47_2:
-; GFX1064-NEXT: v_mov_b32_e32 v0, s2
-; GFX1064-NEXT: v_mov_b32_e32 v1, s3
+; GFX1064-NEXT: v_mov_b32_e32 v0, s0
+; GFX1064-NEXT: v_mov_b32_e32 v1, s1
; GFX1064-NEXT: .LBB47_3: ; %endif
; GFX1064-NEXT: v_mov_b32_e32 v2, 0
-; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX1064-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX1064-NEXT: s_endpgm
entry:
%v = load double, ptr addrspace(1) %in
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index e0b320aa4f372..025b856e3566e 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -22,11 +22,11 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i16_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
@@ -36,10 +36,10 @@ define amdgpu_kernel void @widen_i16_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i16_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -76,11 +76,11 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -91,10 +91,10 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(ptr addrspace(4) %ar
;
; GFX11-LABEL: widen_i16_constant_load_zext_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -134,11 +134,11 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -149,10 +149,10 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(ptr addrspace(4) %ar
;
; GFX11-LABEL: widen_i16_constant_load_sext_i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sext_i32_i16 s0, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -199,13 +199,13 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i17_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, 2
; VI-NEXT: v_mov_b32_e32 v3, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_i32 s0, s0, 34
; VI-NEXT: s_or_b32 s0, s0, 4
@@ -218,10 +218,10 @@ define amdgpu_kernel void @widen_i17_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i17_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_i32 s0, s0, 34
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -263,11 +263,11 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_f16_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f16_e64 v2, s0, 4.0
; VI-NEXT: flat_store_short v[0:1], v2
@@ -275,11 +275,11 @@ define amdgpu_kernel void @widen_f16_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_f16_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f16_e64 v2, s0, 4.0
; GFX11-NEXT: global_store_b16 v[0:1], v2, off
@@ -317,11 +317,11 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_v2i8_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 44
; VI-NEXT: v_mov_b32_e32 v1, 3
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s1, s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -338,9 +338,9 @@ define amdgpu_kernel void @widen_v2i8_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_v2i8_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v0, s0, 12
; GFX11-NEXT: v_and_b32_e64 v1, 0xffffff00, s0
@@ -387,11 +387,11 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
;
; VI-LABEL: no_widen_i16_constant_divergent_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -404,10 +404,10 @@ define amdgpu_kernel void @no_widen_i16_constant_divergent_load(ptr addrspace(4)
;
; GFX11-LABEL: no_widen_i16_constant_divergent_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_u16 v0, v0, s[0:1]
+; GFX11-NEXT: global_load_u16 v0, v0, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_nc_u16 v2, v0, 0x3e7
; GFX11-NEXT: v_mov_b32_e32 v0, 0
@@ -446,11 +446,11 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
;
; VI-LABEL: widen_i1_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -459,10 +459,10 @@ define amdgpu_kernel void @widen_i1_constant_load(ptr addrspace(4) %arg) {
;
; GFX11-LABEL: widen_i1_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -497,11 +497,11 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
@@ -512,10 +512,10 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(ptr addrspace(4)
;
; GFX11-LABEL: widen_i16_zextload_i64_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -556,11 +556,11 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 1
; VI-NEXT: s_add_u32 s0, s0, 0x3e7
@@ -572,9 +572,9 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(ptr addrspace(4) %
;
; GFX11-LABEL: widen_i1_zext_to_i64_constant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
@@ -667,11 +667,11 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
;
; VI-LABEL: widen_i16_global_invariant_load:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 1
@@ -681,10 +681,10 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(ptr addrspace(1) %arg
;
; GFX11-LABEL: widen_i16_global_invariant_load:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT: s_load_b32 s0, s[2:3], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_addk_i32 s0, 0x3e7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 5422bfa5389c9..54240adf607cd 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -29,12 +29,12 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-LABEL: xor_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
@@ -80,12 +80,12 @@ define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in
; VI-LABEL: xor_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v4, s0
-; VI-NEXT: v_mov_b32_e32 v5, s1
+; VI-NEXT: v_mov_b32_e32 v4, s2
+; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; VI-NEXT: v_mov_b32_e32 v8, s4
@@ -134,12 +134,12 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0,
; VI-LABEL: xor_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -190,12 +190,12 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
; VI-LABEL: v_xor_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_ubyte v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_load_ubyte v2, v[2:3] glc
@@ -239,12 +239,12 @@ define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1
; VI-LABEL: vector_xor_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -304,13 +304,13 @@ define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
;
; VI-LABEL: scalar_not_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_not_b32 s2, s2
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: s_not_b32 s0, s4
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
%result = xor i32 %a, -1
@@ -339,13 +339,13 @@ define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: vector_not_i32:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dword v2, v[0:1]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_not_b32_e32 v2, v2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -384,12 +384,12 @@ define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1
; VI-LABEL: vector_xor_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s4
@@ -425,10 +425,10 @@ define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b)
; VI-LABEL: scalar_xor_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
-; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[0:1]
+; VI-NEXT: s_xor_b64 s[0:1], s[6:7], s[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -456,12 +456,12 @@ define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
;
; VI-LABEL: scalar_not_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: s_not_b64 s[0:1], s[2:3]
+; VI-NEXT: s_not_b64 s[0:1], s[6:7]
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
@@ -492,13 +492,13 @@ define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1
;
; VI-LABEL: vector_not_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_not_b32_e32 v0, v0
; VI-NEXT: v_not_b32_e32 v1, v1
@@ -545,25 +545,25 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i
;
; VI-LABEL: xor_cf:
; VI: ; %bb.0: ; %entry
-; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
-; VI-NEXT: s_mov_b64 s[8:9], 0
+; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
+; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_cmp_lg_u64 s[4:5], 0
+; VI-NEXT: s_cmp_lg_u64 s[8:9], 0
; VI-NEXT: s_cbranch_scc0 .LBB12_4
; VI-NEXT: ; %bb.1: ; %else
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9]
+; VI-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; VI-NEXT: s_cbranch_vccnz .LBB12_3
; VI-NEXT: .LBB12_2: ; %if
-; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7]
+; VI-NEXT: s_xor_b64 s[0:1], s[8:9], s[10:11]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: .LBB12_3: ; %endif
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -606,14 +606,14 @@ define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i3
; VI-LABEL: scalar_xor_literal_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s3, s3, 0xf237b
-; VI-NEXT: s_xor_b32 s2, s2, 0x3039
-; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: s_xor_b32 s0, s3, 0xf237b
+; VI-NEXT: s_xor_b32 s1, s2, 0x3039
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%or = xor i64 %a, 4261135838621753
@@ -647,15 +647,15 @@ define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %ou
; VI-LABEL: scalar_xor_literal_multi_use_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; VI-NEXT: s_movk_i32 s2, 0x3039
-; VI-NEXT: s_mov_b32 s3, 0xf237b
+; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
+; VI-NEXT: s_movk_i32 s0, 0x3039
+; VI-NEXT: s_mov_b32 s1, 0xf237b
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3]
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
+; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_add_u32 s0, s6, 0x3039
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_addc_u32 s1, s7, 0xf237b
@@ -689,13 +689,13 @@ define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x
; VI-LABEL: scalar_xor_inline_imm_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b32 s2, s2, 63
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_xor_b32 s0, s2, 63
+; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
%or = xor i64 %a, 63
@@ -720,13 +720,13 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out,
; VI-LABEL: scalar_xor_neg_inline_imm_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x4c
-; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_xor_b64 s[2:3], s[2:3], -8
-; VI-NEXT: v_mov_b32_e32 v0, s0
-; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: v_mov_b32_e32 v1, s1
-; VI-NEXT: v_mov_b32_e32 v3, s3
+; VI-NEXT: s_xor_b64 s[0:1], s[2:3], -8
+; VI-NEXT: v_mov_b32_e32 v0, s4
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
%or = xor i64 %a, -8
@@ -756,13 +756,13 @@ define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out,
;
; VI-LABEL: vector_xor_i64_neg_inline_imm:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_xor_b32_e32 v0, -8, v0
; VI-NEXT: v_xor_b32_e32 v1, -1, v1
@@ -796,13 +796,13 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add
;
; VI-LABEL: vector_xor_literal_i64:
; VI: ; %bb.0:
-; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
+; VI-NEXT: v_mov_b32_e32 v0, s6
+; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
-; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v2, s4
+; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_xor_b32_e32 v1, 0x146f, v1
; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0
diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
index f9137b075e462..af50e09f509a3 100644
--- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll
@@ -53,7 +53,7 @@ define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i
; GCN-DAG: s_and_b32 [[MASK_A:s[0-9]+]], [[A]], 0xffff{{$}}
; GCN-DAG: s_and_b32 [[MASK_B:s[0-9]+]], [[B]], 0xffff{{$}}
-; GCN: s_cmp_eq_u32 [[MASK_A]], [[B]]
+; GCN: s_cmp_eq_u32 [[MASK_A]], [[MASK_B]]
; GCN: s_cselect_b64 [[CC:s\[[0-9:]+\]]], -1, 0
; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CC]]
; GCN: buffer_store_short [[RESULT]]
More information about the llvm-branch-commits
mailing list